Looking at memset's implementation

Not all LibC functions are written in C. Some are written in assembler so that they can be optimized to take into consideration the data size and alignment of the LibC call arguments. This post describes memset based on GDB's disassembly.

void * memset(void *b, int c, size_t len);

Following the 64 bit ABI, the parameters to the function are set as follows:

b is the destination ($rdi)
c is the character to fill the destination with ($rsi) (my example uses 0x38 - character '8')
len is the length ($rdx)

The first thing memset does is mask the character argument down to its low byte (logical AND with 0xFF) and check whether the result is 0.
If it is, the memset call is equivalent to a bzero call and the code jumps to the bzero implementation (0x7fffffe00600).

0x00007fff8749f414 <memset+0>: and    $0xff,%esi
0x00007fff8749f41a <memset+6>: jne 0x7fff8749f42b <memset+23>


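RSI is zero - this memset call is equivalent to a bzero call (0x7fffffe00600). The length is moved from RDX to RSI (the second argument of bzero) and the code jumps to bzero instead of calling it.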
0x00007fff8749f41c <memset+8>: mov    $0x7fffffe00600,%rax
0x00007fff8749f426 <memset+18>: mov %rdx,%rsi
0x00007fff8749f429 <memset+21>: jmpq *%rax


RSI is non-zero.
Save the destination address to R8 (it will be used later on as the return value of memset).
0x00007fff8749f42b <memset+23>: mov    %rdi,%r8


The following section fills each byte of EAX with the character argument (0x38383838)



0x00007fff8749f42e <memset+26>: mov    %esi,%eax
0x00007fff8749f430 <memset+28>: shl $0x8,%esi
0x00007fff8749f433 <memset+31>: or %esi,%eax
0x00007fff8749f435 <memset+33>: mov %eax,%esi
0x00007fff8749f437 <memset+35>: shl $0x10,%esi
0x00007fff8749f43a <memset+38>: or %esi,%eax


Check if the size of the fill is bigger than 255 bytes (i.e. 256 bytes or more).
0x00007fff8749f43c <memset+40>: cmp    $0xff,%rdx
0x00007fff8749f443 <memset+47>: ja 0x7fff8749f490 <memset+124>


Check if the size of the fill is at least 16 bytes.
0x00007fff8749f445 <memset+49>: cmp    $0x10,%edx
0x00007fff8749f448 <memset+52>: jge 0x7fff8749f464 <memset+80>


Check if the size of the fill is zero.
0x00007fff8749f44a <memset+54>: test   %edx,%edx
0x00007fff8749f44c <memset+56>: je 0x7fff8749f48c <memset+120>


Size of fill is less than 16 bytes but not zero.
Fill the destination buffer (RDI is pointing to it) one byte at a time, decrementing EDX at each step.
Continue until EDX is zero, then jump to the function end.
0x00007fff8749f44e <memset+58>: mov    %al,(%rdi)
0x00007fff8749f450 <memset+60>: add $0x1,%rdi
0x00007fff8749f454 <memset+64>: sub $0x1,%edx
0x00007fff8749f457 <memset+67>: jne 0x7fff8749f44e <memset+58>
0x00007fff8749f459 <memset+69>: jmp 0x7fff8749f48c <memset+120>

0x00007fff8749f45b <memset+71>: mov    %al,(%rdi)
0x00007fff8749f45d <memset+73>: add $0x1,%rdi
0x00007fff8749f461 <memset+77>: sub $0x1,%edx


Size of fill is at least 16 bytes (and less than 256).
If the destination is not 4 byte aligned, fill in the first bytes one at a time (the three instructions at memset+71 above) until the destination is aligned.
0x00007fff8749f464 <memset+80>: test   $0x3,%edi
0x00007fff8749f46a <memset+86>: jne 0x7fff8749f45b <memset+71>


The destination is now aligned, so it can be filled in 4 bytes at a time.
0x00007fff8749f46c <memset+88>: mov    %edx,%ecx
0x00007fff8749f46e <memset+90>: shr $0x2,%edx
0x00007fff8749f471 <memset+93>: mov %eax,(%rdi)
0x00007fff8749f473 <memset+95>: add $0x4,%rdi
0x00007fff8749f477 <memset+99>: sub $0x1,%edx
0x00007fff8749f47a <memset+102>: jne 0x7fff8749f471 <memset+93>


If the size was also a multiple of 4, we're done.
0x00007fff8749f47c <memset+104>: and    $0x3,%ecx
0x00007fff8749f47f <memset+107>: je 0x7fff8749f48c <memset+120>


If the size was not a multiple of 4, there are some leftover bytes to fill, one byte at a time.
0x00007fff8749f481 <memset+109>: mov    %al,(%rdi)
0x00007fff8749f483 <memset+111>: add $0x1,%rdi
0x00007fff8749f487 <memset+115>: sub $0x1,%ecx
0x00007fff8749f48a <memset+118>: jne 0x7fff8749f481 <memset+109>


Load RAX with the value of R8 (pointer to fill buffer saved at beginning) and return.
0x00007fff8749f48c <memset+120>: mov    %r8,%rax
0x00007fff8749f48f <memset+123>: retq


A fill operation of 256 bytes or more.
This is done using SSE instructions. xmm0 is set to 16 bytes of 0x38 (our character)
0x00007fff8749f490 <memset+124>: movd   %eax,%xmm0
0x00007fff8749f494 <memset+128>: pshufd $0x0,%xmm0,%xmm0


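Check for 16 byte alignment of the destination: ECX = (-RDI) & 0xF is the number of bytes needed to reach the next 16 byte boundary (zero if the destination is already aligned).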
0x00007fff8749f499 <memset+133>: mov    %rdi,%rcx
0x00007fff8749f49c <memset+136>: neg %ecx
0x00007fff8749f49e <memset+138>: and $0xf,%ecx
0x00007fff8749f4a1 <memset+141>: je 0x7fff8749f4b1 <memset+157>


The destination is not 16 byte aligned: subtract ECX from the length and fill ECX bytes one at a time, so that the rest of the destination is aligned.
0x00007fff8749f4a3 <memset+143>: sub    %rcx,%rdx
0x00007fff8749f4a6 <memset+146>: mov %al,(%rdi)
0x00007fff8749f4a8 <memset+148>: add $0x1,%rdi
0x00007fff8749f4ac <memset+152>: sub $0x1,%ecx
0x00007fff8749f4af <memset+155>: jne 0x7fff8749f4a6 <memset+146>


Call _memset_pattern.
0x00007fff8749f4b1 <memset+157>: mov    $0x7fffffe01000,%rax
0x00007fff8749f4bb <memset+167>: callq *%rax


Load RAX with the value of R8 (pointer to fill buffer saved at beginning) and return.
0x00007fff8749f4bd <memset+169>: mov    %r8,%rax
0x00007fff8749f4c0 <memset+172>: retq