397 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
		
		
			
		
	
	
			397 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			NASM
		
	
	
	
	
	
|   | ; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding | ||
|  | ; version for AMD64 on Windows using Microsoft C compiler | ||
|  | ; | ||
|  | ; inffasx64.asm is automatically convert from AMD64 portion of inffas86.c | ||
|  | ; inffasx64.asm is called by inffas8664.c, which contain more info. | ||
|  | 
 | ||
|  | 
 | ||
|  | ; to compile this file, I use option | ||
|  | ;   ml64.exe /Flinffasx64 /c /Zi inffasx64.asm | ||
|  | ;   with Microsoft Macro Assembler (x64) for AMD64 | ||
|  | ; | ||
|  | 
 | ||
|  | ; This file compile with Microsoft Macro Assembler (x64) for AMD64 | ||
|  | ; | ||
|  | ;   ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK | ||
|  | ; | ||
|  | ;   (you can get Windows WDK with ml64 for AMD64 from | ||
|  | ;      http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price) | ||
|  | ; | ||
|  | 
 | ||
|  | 
 | ||
|  | .code | ||
|  | inffas8664fnc PROC | ||
|  | 
 | ||
|  | ; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and | ||
|  | ; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp | ||
|  | ; | ||
|  | ; All registers must be preserved across the call, except for | ||
|  | ;   rax, rcx, rdx, r8, r-9, r10, and r11, which are scratch. | ||
|  | 
 | ||
|  | 
 | ||
|  | 	mov [rsp-8],rsi | ||
|  | 	mov [rsp-16],rdi | ||
|  | 	mov [rsp-24],r12 | ||
|  | 	mov [rsp-32],r13 | ||
|  | 	mov [rsp-40],r14 | ||
|  | 	mov [rsp-48],r15 | ||
|  | 	mov [rsp-56],rbx | ||
|  | 
 | ||
|  | 	mov rax,rcx | ||
|  | 
 | ||
|  | 	mov	[rax+8], rbp       ; /* save regs rbp and rsp */ | ||
|  | 	mov	[rax], rsp | ||
|  | 
 | ||
|  | 	mov	rsp, rax          ; /* make rsp point to &ar */ | ||
|  | 
 | ||
|  | 	mov	rsi, [rsp+16]      ; /* rsi  = in */ | ||
|  | 	mov	rdi, [rsp+32]      ; /* rdi  = out */ | ||
|  | 	mov	r9, [rsp+24]       ; /* r9   = last */ | ||
|  | 	mov	r10, [rsp+48]      ; /* r10  = end */ | ||
|  | 	mov	rbp, [rsp+64]      ; /* rbp  = lcode */ | ||
|  | 	mov	r11, [rsp+72]      ; /* r11  = dcode */ | ||
|  | 	mov	rdx, [rsp+80]      ; /* rdx  = hold */ | ||
|  | 	mov	ebx, [rsp+88]      ; /* ebx  = bits */ | ||
|  | 	mov	r12d, [rsp+100]    ; /* r12d = lmask */ | ||
|  | 	mov	r13d, [rsp+104]    ; /* r13d = dmask */ | ||
|  |                                           ; /* r14d = len */ | ||
|  |                                           ; /* r15d = dist */ | ||
|  | 
 | ||
|  | 
 | ||
|  | 	cld | ||
|  | 	cmp	r10, rdi | ||
|  | 	je	L_one_time           ; /* if only one decode left */ | ||
|  | 	cmp	r9, rsi | ||
|  | 
 | ||
|  |     jne L_do_loop | ||
|  | 
 | ||
|  | 
 | ||
|  | L_one_time: | ||
|  | 	mov	r8, r12           ; /* r8 = lmask */ | ||
|  | 	cmp	bl, 32 | ||
|  | 	ja	L_get_length_code_one_time | ||
|  | 
 | ||
|  | 	lodsd                         ; /* eax = *(uint *)in++ */ | ||
|  | 	mov	cl, bl            ; /* cl = bits, needs it for shifting */ | ||
|  | 	add	bl, 32             ; /* bits += 32 */ | ||
|  | 	shl	rax, cl | ||
|  | 	or	rdx, rax          ; /* hold |= *((uint *)in)++ << bits */ | ||
|  | 	jmp	L_get_length_code_one_time | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_while_test: | ||
|  | 	cmp	r10, rdi | ||
|  | 	jbe	L_break_loop | ||
|  | 	cmp	r9, rsi | ||
|  | 	jbe	L_break_loop | ||
|  | 
 | ||
|  | L_do_loop: | ||
|  | 	mov	r8, r12           ; /* r8 = lmask */ | ||
|  | 	cmp	bl, 32 | ||
|  | 	ja	L_get_length_code    ; /* if (32 < bits) */ | ||
|  | 
 | ||
|  | 	lodsd                         ; /* eax = *(uint *)in++ */ | ||
|  | 	mov	cl, bl            ; /* cl = bits, needs it for shifting */ | ||
|  | 	add	bl, 32             ; /* bits += 32 */ | ||
|  | 	shl	rax, cl | ||
|  | 	or	rdx, rax          ; /* hold |= *((uint *)in)++ << bits */ | ||
|  | 
 | ||
|  | L_get_length_code: | ||
|  | 	and	r8, rdx            ; /* r8 &= hold */ | ||
|  | 	mov	eax, [rbp+r8*4]  ; /* eax = lcode[hold & lmask] */ | ||
|  | 
 | ||
|  | 	mov	cl, ah            ; /* cl = this.bits */ | ||
|  | 	sub	bl, ah            ; /* bits -= this.bits */ | ||
|  | 	shr	rdx, cl           ; /* hold >>= this.bits */ | ||
|  | 
 | ||
|  | 	test	al, al | ||
|  | 	jnz	L_test_for_length_base ; /* if (op != 0) 45.7% */ | ||
|  | 
 | ||
|  | 	mov	r8, r12            ; /* r8 = lmask */ | ||
|  | 	shr	eax, 16            ; /* output this.val char */ | ||
|  | 	stosb | ||
|  | 
 | ||
|  | L_get_length_code_one_time: | ||
|  | 	and	r8, rdx            ; /* r8 &= hold */ | ||
|  | 	mov	eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */ | ||
|  | 
 | ||
|  | L_dolen: | ||
|  | 	mov	cl, ah            ; /* cl = this.bits */ | ||
|  | 	sub	bl, ah            ; /* bits -= this.bits */ | ||
|  | 	shr	rdx, cl           ; /* hold >>= this.bits */ | ||
|  | 
 | ||
|  | 	test	al, al | ||
|  | 	jnz	L_test_for_length_base ; /* if (op != 0) 45.7% */ | ||
|  | 
 | ||
|  | 	shr	eax, 16            ; /* output this.val char */ | ||
|  | 	stosb | ||
|  | 	jmp	L_while_test | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_test_for_length_base: | ||
|  | 	mov	r14d, eax         ; /* len = this */ | ||
|  | 	shr	r14d, 16           ; /* len = this.val */ | ||
|  | 	mov	cl, al | ||
|  | 
 | ||
|  | 	test	al, 16 | ||
|  | 	jz	L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */ | ||
|  | 	and	cl, 15             ; /* op &= 15 */ | ||
|  | 	jz	L_decode_distance    ; /* if (!op) */ | ||
|  | 
 | ||
|  | L_add_bits_to_len: | ||
|  | 	sub	bl, cl | ||
|  | 	xor	eax, eax | ||
|  | 	inc	eax | ||
|  | 	shl	eax, cl | ||
|  | 	dec	eax | ||
|  | 	and	eax, edx          ; /* eax &= hold */ | ||
|  | 	shr	rdx, cl | ||
|  | 	add	r14d, eax         ; /* len += hold & mask[op] */ | ||
|  | 
 | ||
|  | L_decode_distance: | ||
|  | 	mov	r8, r13           ; /* r8 = dmask */ | ||
|  | 	cmp	bl, 32 | ||
|  | 	ja	L_get_distance_code  ; /* if (32 < bits) */ | ||
|  | 
 | ||
|  | 	lodsd                         ; /* eax = *(uint *)in++ */ | ||
|  | 	mov	cl, bl            ; /* cl = bits, needs it for shifting */ | ||
|  | 	add	bl, 32             ; /* bits += 32 */ | ||
|  | 	shl	rax, cl | ||
|  | 	or	rdx, rax          ; /* hold |= *((uint *)in)++ << bits */ | ||
|  | 
 | ||
|  | L_get_distance_code: | ||
|  | 	and	r8, rdx           ; /* r8 &= hold */ | ||
|  | 	mov	eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */ | ||
|  | 
 | ||
|  | L_dodist: | ||
|  | 	mov	r15d, eax         ; /* dist = this */ | ||
|  | 	shr	r15d, 16           ; /* dist = this.val */ | ||
|  | 	mov	cl, ah | ||
|  | 	sub	bl, ah            ; /* bits -= this.bits */ | ||
|  | 	shr	rdx, cl           ; /* hold >>= this.bits */ | ||
|  | 	mov	cl, al            ; /* cl = this.op */ | ||
|  | 
 | ||
|  | 	test	al, 16             ; /* if ((op & 16) == 0) */ | ||
|  | 	jz	L_test_for_second_level_dist | ||
|  | 	and	cl, 15             ; /* op &= 15 */ | ||
|  | 	jz	L_check_dist_one | ||
|  | 
 | ||
|  | L_add_bits_to_dist: | ||
|  | 	sub	bl, cl | ||
|  | 	xor	eax, eax | ||
|  | 	inc	eax | ||
|  | 	shl	eax, cl | ||
|  | 	dec	eax                 ; /* (1 << op) - 1 */ | ||
|  | 	and	eax, edx          ; /* eax &= hold */ | ||
|  | 	shr	rdx, cl | ||
|  | 	add	r15d, eax         ; /* dist += hold & ((1 << op) - 1) */ | ||
|  | 
 | ||
|  | L_check_window: | ||
|  | 	mov	r8, rsi           ; /* save in so from can use it's reg */ | ||
|  | 	mov	rax, rdi | ||
|  | 	sub	rax, [rsp+40]      ; /* nbytes = out - beg */ | ||
|  | 
 | ||
|  | 	cmp	eax, r15d | ||
|  | 	jb	L_clip_window        ; /* if (dist > nbytes) 4.2% */ | ||
|  | 
 | ||
|  | 	mov	ecx, r14d         ; /* ecx = len */ | ||
|  | 	mov	rsi, rdi | ||
|  | 	sub	rsi, r15          ; /* from = out - dist */ | ||
|  | 
 | ||
|  | 	sar	ecx, 1 | ||
|  | 	jnc	L_copy_two           ; /* if len % 2 == 0 */ | ||
|  | 
 | ||
|  | 	rep     movsw | ||
|  | 	mov	al, [rsi] | ||
|  | 	mov	[rdi], al | ||
|  | 	inc	rdi | ||
|  | 
 | ||
|  | 	mov	rsi, r8           ; /* move in back to %rsi, toss from */ | ||
|  | 	jmp	L_while_test | ||
|  | 
 | ||
|  | L_copy_two: | ||
|  | 	rep     movsw | ||
|  | 	mov	rsi, r8           ; /* move in back to %rsi, toss from */ | ||
|  | 	jmp	L_while_test | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_check_dist_one: | ||
|  | 	cmp	r15d, 1            ; /* if dist 1, is a memset */ | ||
|  | 	jne	L_check_window | ||
|  | 	cmp	[rsp+40], rdi      ; /* if out == beg, outside window */ | ||
|  | 	je	L_check_window | ||
|  | 
 | ||
|  | 	mov	ecx, r14d         ; /* ecx = len */ | ||
|  | 	mov	al, [rdi-1] | ||
|  | 	mov	ah, al | ||
|  | 
 | ||
|  | 	sar	ecx, 1 | ||
|  | 	jnc	L_set_two | ||
|  | 	mov	[rdi], al | ||
|  | 	inc	rdi | ||
|  | 
 | ||
|  | L_set_two: | ||
|  | 	rep     stosw | ||
|  | 	jmp	L_while_test | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_test_for_second_level_length: | ||
|  | 	test	al, 64 | ||
|  | 	jnz	L_test_for_end_of_block ; /* if ((op & 64) != 0) */ | ||
|  | 
 | ||
|  | 	xor	eax, eax | ||
|  | 	inc	eax | ||
|  | 	shl	eax, cl | ||
|  | 	dec	eax | ||
|  | 	and	eax, edx         ; /* eax &= hold */ | ||
|  | 	add	eax, r14d        ; /* eax += len */ | ||
|  | 	mov	eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/ | ||
|  | 	jmp	L_dolen | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_test_for_second_level_dist: | ||
|  | 	test	al, 64 | ||
|  | 	jnz	L_invalid_distance_code ; /* if ((op & 64) != 0) */ | ||
|  | 
 | ||
|  | 	xor	eax, eax | ||
|  | 	inc	eax | ||
|  | 	shl	eax, cl | ||
|  | 	dec	eax | ||
|  | 	and	eax, edx         ; /* eax &= hold */ | ||
|  | 	add	eax, r15d        ; /* eax += dist */ | ||
|  | 	mov	eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/ | ||
|  | 	jmp	L_dodist | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_clip_window: | ||
|  | 	mov	ecx, eax         ; /* ecx = nbytes */ | ||
|  | 	mov	eax, [rsp+92]     ; /* eax = wsize, prepare for dist cmp */ | ||
|  | 	neg	ecx                ; /* nbytes = -nbytes */ | ||
|  | 
 | ||
|  | 	cmp	eax, r15d | ||
|  | 	jb	L_invalid_distance_too_far ; /* if (dist > wsize) */ | ||
|  | 
 | ||
|  | 	add	ecx, r15d         ; /* nbytes = dist - nbytes */ | ||
|  | 	cmp	dword ptr [rsp+96], 0 | ||
|  | 	jne	L_wrap_around_window ; /* if (write != 0) */ | ||
|  | 
 | ||
|  | 	mov	rsi, [rsp+56]     ; /* from  = window */ | ||
|  | 	sub	eax, ecx         ; /* eax  -= nbytes */ | ||
|  | 	add	rsi, rax         ; /* from += wsize - nbytes */ | ||
|  | 
 | ||
|  | 	mov	eax, r14d        ; /* eax = len */ | ||
|  | 	cmp	r14d, ecx | ||
|  | 	jbe	L_do_copy           ; /* if (nbytes >= len) */ | ||
|  | 
 | ||
|  | 	sub	eax, ecx         ; /* eax -= nbytes */ | ||
|  | 	rep     movsb | ||
|  | 	mov	rsi, rdi | ||
|  | 	sub	rsi, r15         ; /* from = &out[ -dist ] */ | ||
|  | 	jmp	L_do_copy | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_wrap_around_window: | ||
|  | 	mov	eax, [rsp+96]     ; /* eax = write */ | ||
|  | 	cmp	ecx, eax | ||
|  | 	jbe	L_contiguous_in_window ; /* if (write >= nbytes) */ | ||
|  | 
 | ||
|  | 	mov	esi, [rsp+92]     ; /* from  = wsize */ | ||
|  | 	add	rsi, [rsp+56]     ; /* from += window */ | ||
|  | 	add	rsi, rax         ; /* from += write */ | ||
|  | 	sub	rsi, rcx         ; /* from -= nbytes */ | ||
|  | 	sub	ecx, eax         ; /* nbytes -= write */ | ||
|  | 
 | ||
|  | 	mov	eax, r14d        ; /* eax = len */ | ||
|  | 	cmp	eax, ecx | ||
|  | 	jbe	L_do_copy           ; /* if (nbytes >= len) */ | ||
|  | 
 | ||
|  | 	sub	eax, ecx         ; /* len -= nbytes */ | ||
|  | 	rep     movsb | ||
|  | 	mov	rsi, [rsp+56]     ; /* from = window */ | ||
|  | 	mov	ecx, [rsp+96]     ; /* nbytes = write */ | ||
|  | 	cmp	eax, ecx | ||
|  | 	jbe	L_do_copy           ; /* if (nbytes >= len) */ | ||
|  | 
 | ||
|  | 	sub	eax, ecx         ; /* len -= nbytes */ | ||
|  | 	rep     movsb | ||
|  | 	mov	rsi, rdi | ||
|  | 	sub	rsi, r15         ; /* from = out - dist */ | ||
|  | 	jmp	L_do_copy | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_contiguous_in_window: | ||
|  | 	mov	rsi, [rsp+56]     ; /* rsi = window */ | ||
|  | 	add	rsi, rax | ||
|  | 	sub	rsi, rcx         ; /* from += write - nbytes */ | ||
|  | 
 | ||
|  | 	mov	eax, r14d        ; /* eax = len */ | ||
|  | 	cmp	eax, ecx | ||
|  | 	jbe	L_do_copy           ; /* if (nbytes >= len) */ | ||
|  | 
 | ||
|  | 	sub	eax, ecx         ; /* len -= nbytes */ | ||
|  | 	rep     movsb | ||
|  | 	mov	rsi, rdi | ||
|  | 	sub	rsi, r15         ; /* from = out - dist */ | ||
|  | 	jmp	L_do_copy           ; /* if (nbytes >= len) */ | ||
|  | 
 | ||
|  | ALIGN 4 | ||
|  | L_do_copy: | ||
|  | 	mov	ecx, eax         ; /* ecx = len */ | ||
|  | 	rep     movsb | ||
|  | 
 | ||
|  | 	mov	rsi, r8          ; /* move in back to %esi, toss from */ | ||
|  | 	jmp	L_while_test | ||
|  | 
 | ||
|  | L_test_for_end_of_block: | ||
|  | 	test	al, 32 | ||
|  | 	jz	L_invalid_literal_length_code | ||
|  | 	mov	dword ptr [rsp+116], 1 | ||
|  | 	jmp	L_break_loop_with_status | ||
|  | 
 | ||
|  | L_invalid_literal_length_code: | ||
|  | 	mov	dword ptr [rsp+116], 2 | ||
|  | 	jmp	L_break_loop_with_status | ||
|  | 
 | ||
|  | L_invalid_distance_code: | ||
|  | 	mov	dword ptr [rsp+116], 3 | ||
|  | 	jmp	L_break_loop_with_status | ||
|  | 
 | ||
|  | L_invalid_distance_too_far: | ||
|  | 	mov	dword ptr [rsp+116], 4 | ||
|  | 	jmp	L_break_loop_with_status | ||
|  | 
 | ||
|  | L_break_loop: | ||
|  | 	mov	dword ptr [rsp+116], 0 | ||
|  | 
 | ||
|  | L_break_loop_with_status: | ||
|  | ; /* put in, out, bits, and hold back into ar and pop esp */ | ||
|  | 	mov	[rsp+16], rsi     ; /* in */ | ||
|  | 	mov	[rsp+32], rdi     ; /* out */ | ||
|  | 	mov	[rsp+88], ebx     ; /* bits */ | ||
|  | 	mov	[rsp+80], rdx     ; /* hold */ | ||
|  | 
 | ||
|  | 	mov	rax, [rsp]       ; /* restore rbp and rsp */ | ||
|  | 	mov	rbp, [rsp+8] | ||
|  | 	mov	rsp, rax | ||
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | 	mov rsi,[rsp-8] | ||
|  | 	mov rdi,[rsp-16] | ||
|  | 	mov r12,[rsp-24] | ||
|  | 	mov r13,[rsp-32] | ||
|  | 	mov r14,[rsp-40] | ||
|  | 	mov r15,[rsp-48] | ||
|  | 	mov rbx,[rsp-56] | ||
|  | 
 | ||
|  |     ret 0 | ||
|  | ;          : | ||
|  | ;          : "m" (ar) | ||
|  | ;          : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", | ||
|  | ;            "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" | ||
|  | ;    ); | ||
|  | 
 | ||
|  | inffas8664fnc 	ENDP | ||
|  | ;_TEXT	ENDS | ||
|  | END |