From 288f1080317b954b6bdca33708631c011549c008 Mon Sep 17 00:00:00 2001 From: Mark Adler Date: Thu, 12 Oct 2017 20:08:53 -0700 Subject: [PATCH] Remove old assembler code in which bugs have manifested. In addition, there is not sufficient gain from the inflate assembler code to warrant its inclusion. --- contrib/README.contrib | 21 - contrib/amd64/amd64-match.S | 452 ----------- contrib/asm686/README.686 | 51 -- contrib/asm686/match.S | 357 --------- contrib/inflate86/inffas86.c | 1157 ---------------------------- contrib/inflate86/inffast.S | 1368 --------------------------------- contrib/masmx64/bld_ml64.bat | 2 - contrib/masmx64/gvmat64.asm | 553 ------------- contrib/masmx64/inffas8664.c | 186 ----- contrib/masmx64/inffasx64.asm | 396 ---------- contrib/masmx64/readme.txt | 31 - contrib/masmx86/bld_ml32.bat | 2 - contrib/masmx86/inffas32.asm | 1080 -------------------------- contrib/masmx86/match686.asm | 479 ------------ contrib/masmx86/readme.txt | 27 - win32/Makefile.bor | 1 - win32/Makefile.gcc | 5 - win32/Makefile.msc | 4 - 18 files changed, 6172 deletions(-) delete mode 100644 contrib/amd64/amd64-match.S delete mode 100644 contrib/asm686/README.686 delete mode 100644 contrib/asm686/match.S delete mode 100644 contrib/inflate86/inffas86.c delete mode 100644 contrib/inflate86/inffast.S delete mode 100644 contrib/masmx64/bld_ml64.bat delete mode 100644 contrib/masmx64/gvmat64.asm delete mode 100644 contrib/masmx64/inffas8664.c delete mode 100644 contrib/masmx64/inffasx64.asm delete mode 100644 contrib/masmx64/readme.txt delete mode 100644 contrib/masmx86/bld_ml32.bat delete mode 100644 contrib/masmx86/inffas32.asm delete mode 100644 contrib/masmx86/match686.asm delete mode 100644 contrib/masmx86/readme.txt diff --git a/contrib/README.contrib b/contrib/README.contrib index a411d5c..335e435 100644 --- a/contrib/README.contrib +++ b/contrib/README.contrib @@ -8,14 +8,6 @@ ada/ by Dmitriy Anisimkov Support for Ada See http://zlib-ada.sourceforge.net/ -amd64/ by Mikhail Teterin - asm code for AMD64 - See patch at http://www.freebsd.org/cgi/query-pr.cgi?pr=bin/96393 - -asm686/ by Brian Raiter - asm code for Pentium and PPro/PII, using the AT&T (GNU as) syntax - See http://www.muppetlabs.com/~breadbox/software/assembly.html - blast/ by Mark Adler Decompressor for output of PKWare Data Compression Library (DCL) @@ -32,9 +24,6 @@ gcc_gvmat64/by Gilles Vollant infback9/ by Mark Adler Unsupported diffs to infback to decode the deflate64 format -inflate86/ by Chris Anderson - Tuned x86 gcc asm code to replace inflate_fast() - iostream/ by Kevin Ruland A C++ I/O streams interface to the zlib gz* functions @@ -45,16 +34,6 @@ iostream3/ by Ludwig Schwardt and Kevin Ruland Yet another C++ I/O streams interface -masmx64/ by Gilles Vollant - x86 64-bit (AMD64 and Intel EM64t) code for x64 assembler to - replace longest_match() and inflate_fast(), also masm x86 - 64-bits translation of Chris Anderson inflate_fast() - -masmx86/ by Gilles Vollant - x86 asm code to replace longest_match() and inflate_fast(), - for Visual C++ and MASM (32 bits). - Based on Brian Raiter (asm686) and Chris Anderson (inflate86) - minizip/ by Gilles Vollant Mini zip and unzip based on zlib Includes Zip64 support by Mathias Svensson diff --git a/contrib/amd64/amd64-match.S b/contrib/amd64/amd64-match.S deleted file mode 100644 index 81d4a1c..0000000 --- a/contrib/amd64/amd64-match.S +++ /dev/null @@ -1,452 +0,0 @@ -/* - * match.S -- optimized version of longest_match() - * based on the similar work by Gilles Vollant, and Brian Raiter, written 1998 - * - * This is free software; you can redistribute it and/or modify it - * under the terms of the BSD License. Use by owners of Che Guevarra - * parafernalia is prohibited, where possible, and highly discouraged - * elsewhere. - */ - -#ifndef NO_UNDERLINE -# define match_init _match_init -# define longest_match _longest_match -#endif - -#define scanend ebx -#define scanendw bx -#define chainlenwmask edx /* high word: current chain len low word: s->wmask */ -#define curmatch rsi -#define curmatchd esi -#define windowbestlen r8 -#define scanalign r9 -#define scanalignd r9d -#define window r10 -#define bestlen r11 -#define bestlend r11d -#define scanstart r12d -#define scanstartw r12w -#define scan r13 -#define nicematch r14d -#define limit r15 -#define limitd r15d -#define prev rcx - -/* - * The 258 is a "magic number, not a parameter -- changing it - * breaks the hell loose - */ -#define MAX_MATCH (258) -#define MIN_MATCH (3) -#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) -#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) - -/* stack frame offsets */ -#define LocalVarsSize (112) -#define _chainlenwmask ( 8-LocalVarsSize)(%rsp) -#define _windowbestlen (16-LocalVarsSize)(%rsp) -#define save_r14 (24-LocalVarsSize)(%rsp) -#define save_rsi (32-LocalVarsSize)(%rsp) -#define save_rbx (40-LocalVarsSize)(%rsp) -#define save_r12 (56-LocalVarsSize)(%rsp) -#define save_r13 (64-LocalVarsSize)(%rsp) -#define save_r15 (80-LocalVarsSize)(%rsp) - - -.globl match_init, longest_match - -/* - * On AMD64 the first argument of a function (in our case -- the pointer to - * deflate_state structure) is passed in %rdi, hence our offsets below are - * all off of that. - */ - -/* you can check the structure offset by running - -#include -#include -#include "deflate.h" - -void print_depl() -{ -deflate_state ds; -deflate_state *s=&ds; -printf("size pointer=%u\n",(int)sizeof(void*)); - -printf("#define dsWSize (%3u)(%%rdi)\n",(int)(((char*)&(s->w_size))-((char*)s))); -printf("#define dsWMask (%3u)(%%rdi)\n",(int)(((char*)&(s->w_mask))-((char*)s))); -printf("#define dsWindow (%3u)(%%rdi)\n",(int)(((char*)&(s->window))-((char*)s))); -printf("#define dsPrev (%3u)(%%rdi)\n",(int)(((char*)&(s->prev))-((char*)s))); -printf("#define dsMatchLen (%3u)(%%rdi)\n",(int)(((char*)&(s->match_length))-((char*)s))); -printf("#define dsPrevMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_match))-((char*)s))); -printf("#define dsStrStart (%3u)(%%rdi)\n",(int)(((char*)&(s->strstart))-((char*)s))); -printf("#define dsMatchStart (%3u)(%%rdi)\n",(int)(((char*)&(s->match_start))-((char*)s))); -printf("#define dsLookahead (%3u)(%%rdi)\n",(int)(((char*)&(s->lookahead))-((char*)s))); -printf("#define dsPrevLen (%3u)(%%rdi)\n",(int)(((char*)&(s->prev_length))-((char*)s))); -printf("#define dsMaxChainLen (%3u)(%%rdi)\n",(int)(((char*)&(s->max_chain_length))-((char*)s))); -printf("#define dsGoodMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->good_match))-((char*)s))); -printf("#define dsNiceMatch (%3u)(%%rdi)\n",(int)(((char*)&(s->nice_match))-((char*)s))); -} - -*/ - - -/* - to compile for XCode 3.2 on MacOSX x86_64 - - run "gcc -g -c -DXCODE_MAC_X64_STRUCTURE amd64-match.S" - */ - - -#ifndef CURRENT_LINX_XCODE_MAC_X64_STRUCTURE -#define dsWSize ( 68)(%rdi) -#define dsWMask ( 76)(%rdi) -#define dsWindow ( 80)(%rdi) -#define dsPrev ( 96)(%rdi) -#define dsMatchLen (144)(%rdi) -#define dsPrevMatch (148)(%rdi) -#define dsStrStart (156)(%rdi) -#define dsMatchStart (160)(%rdi) -#define dsLookahead (164)(%rdi) -#define dsPrevLen (168)(%rdi) -#define dsMaxChainLen (172)(%rdi) -#define dsGoodMatch (188)(%rdi) -#define dsNiceMatch (192)(%rdi) - -#else - -#ifndef STRUCT_OFFSET -# define STRUCT_OFFSET (0) -#endif - - -#define dsWSize ( 56 + STRUCT_OFFSET)(%rdi) -#define dsWMask ( 64 + STRUCT_OFFSET)(%rdi) -#define dsWindow ( 72 + STRUCT_OFFSET)(%rdi) -#define dsPrev ( 88 + STRUCT_OFFSET)(%rdi) -#define dsMatchLen (136 + STRUCT_OFFSET)(%rdi) -#define dsPrevMatch (140 + STRUCT_OFFSET)(%rdi) -#define dsStrStart (148 + STRUCT_OFFSET)(%rdi) -#define dsMatchStart (152 + STRUCT_OFFSET)(%rdi) -#define dsLookahead (156 + STRUCT_OFFSET)(%rdi) -#define dsPrevLen (160 + STRUCT_OFFSET)(%rdi) -#define dsMaxChainLen (164 + STRUCT_OFFSET)(%rdi) -#define dsGoodMatch (180 + STRUCT_OFFSET)(%rdi) -#define dsNiceMatch (184 + STRUCT_OFFSET)(%rdi) - -#endif - - - - -.text - -/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ - -longest_match: -/* - * Retrieve the function arguments. %curmatch will hold cur_match - * throughout the entire function (passed via rsi on amd64). - * rdi will hold the pointer to the deflate_state (first arg on amd64) - */ - mov %rsi, save_rsi - mov %rbx, save_rbx - mov %r12, save_r12 - mov %r13, save_r13 - mov %r14, save_r14 - mov %r15, save_r15 - -/* uInt wmask = s->w_mask; */ -/* unsigned chain_length = s->max_chain_length; */ -/* if (s->prev_length >= s->good_match) { */ -/* chain_length >>= 2; */ -/* } */ - - movl dsPrevLen, %eax - movl dsGoodMatch, %ebx - cmpl %ebx, %eax - movl dsWMask, %eax - movl dsMaxChainLen, %chainlenwmask - jl LastMatchGood - shrl $2, %chainlenwmask -LastMatchGood: - -/* chainlen is decremented once beforehand so that the function can */ -/* use the sign flag instead of the zero flag for the exit test. */ -/* It is then shifted into the high word, to make room for the wmask */ -/* value, which it will always accompany. */ - - decl %chainlenwmask - shll $16, %chainlenwmask - orl %eax, %chainlenwmask - -/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ - - movl dsNiceMatch, %eax - movl dsLookahead, %ebx - cmpl %eax, %ebx - jl LookaheadLess - movl %eax, %ebx -LookaheadLess: movl %ebx, %nicematch - -/* register Bytef *scan = s->window + s->strstart; */ - - mov dsWindow, %window - movl dsStrStart, %limitd - lea (%limit, %window), %scan - -/* Determine how many bytes the scan ptr is off from being */ -/* dword-aligned. */ - - mov %scan, %scanalign - negl %scanalignd - andl $3, %scanalignd - -/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ -/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ - - movl dsWSize, %eax - subl $MIN_LOOKAHEAD, %eax - xorl %ecx, %ecx - subl %eax, %limitd - cmovng %ecx, %limitd - -/* int best_len = s->prev_length; */ - - movl dsPrevLen, %bestlend - -/* Store the sum of s->window + best_len in %windowbestlen locally, and in memory. */ - - lea (%window, %bestlen), %windowbestlen - mov %windowbestlen, _windowbestlen - -/* register ush scan_start = *(ushf*)scan; */ -/* register ush scan_end = *(ushf*)(scan+best_len-1); */ -/* Posf *prev = s->prev; */ - - movzwl (%scan), %scanstart - movzwl -1(%scan, %bestlen), %scanend - mov dsPrev, %prev - -/* Jump into the main loop. */ - - movl %chainlenwmask, _chainlenwmask - jmp LoopEntry - -.balign 16 - -/* do { - * match = s->window + cur_match; - * if (*(ushf*)(match+best_len-1) != scan_end || - * *(ushf*)match != scan_start) continue; - * [...] - * } while ((cur_match = prev[cur_match & wmask]) > limit - * && --chain_length != 0); - * - * Here is the inner loop of the function. The function will spend the - * majority of its time in this loop, and majority of that time will - * be spent in the first ten instructions. - */ -LookupLoop: - andl %chainlenwmask, %curmatchd - movzwl (%prev, %curmatch, 2), %curmatchd - cmpl %limitd, %curmatchd - jbe LeaveNow - subl $0x00010000, %chainlenwmask - js LeaveNow -LoopEntry: cmpw -1(%windowbestlen, %curmatch), %scanendw - jne LookupLoop - cmpw %scanstartw, (%window, %curmatch) - jne LookupLoop - -/* Store the current value of chainlen. */ - movl %chainlenwmask, _chainlenwmask - -/* %scan is the string under scrutiny, and %prev to the string we */ -/* are hoping to match it up with. In actuality, %esi and %edi are */ -/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ -/* initialized to -(MAX_MATCH_8 - scanalign). */ - - mov $(-MAX_MATCH_8), %rdx - lea (%curmatch, %window), %windowbestlen - lea MAX_MATCH_8(%windowbestlen, %scanalign), %windowbestlen - lea MAX_MATCH_8(%scan, %scanalign), %prev - -/* the prefetching below makes very little difference... */ - prefetcht1 (%windowbestlen, %rdx) - prefetcht1 (%prev, %rdx) - -/* - * Test the strings for equality, 8 bytes at a time. At the end, - * adjust %rdx so that it is offset to the exact byte that mismatched. - * - * It should be confessed that this loop usually does not represent - * much of the total running time. Replacing it with a more - * straightforward "rep cmpsb" would not drastically degrade - * performance -- unrolling it, for example, makes no difference. - */ - -#undef USE_SSE /* works, but is 6-7% slower, than non-SSE... */ - -LoopCmps: -#ifdef USE_SSE - /* Preload the SSE registers */ - movdqu (%windowbestlen, %rdx), %xmm1 - movdqu (%prev, %rdx), %xmm2 - pcmpeqb %xmm2, %xmm1 - movdqu 16(%windowbestlen, %rdx), %xmm3 - movdqu 16(%prev, %rdx), %xmm4 - pcmpeqb %xmm4, %xmm3 - movdqu 32(%windowbestlen, %rdx), %xmm5 - movdqu 32(%prev, %rdx), %xmm6 - pcmpeqb %xmm6, %xmm5 - movdqu 48(%windowbestlen, %rdx), %xmm7 - movdqu 48(%prev, %rdx), %xmm8 - pcmpeqb %xmm8, %xmm7 - - /* Check the comparisions' results */ - pmovmskb %xmm1, %rax - notw %ax - bsfw %ax, %ax - jnz LeaveLoopCmps - - /* this is the only iteration of the loop with a possibility of having - incremented rdx by 0x108 (each loop iteration add 16*4 = 0x40 - and (0x40*4)+8=0x108 */ - add $8, %rdx - jz LenMaximum - add $8, %rdx - - - pmovmskb %xmm3, %rax - notw %ax - bsfw %ax, %ax - jnz LeaveLoopCmps - - - add $16, %rdx - - - pmovmskb %xmm5, %rax - notw %ax - bsfw %ax, %ax - jnz LeaveLoopCmps - - add $16, %rdx - - - pmovmskb %xmm7, %rax - notw %ax - bsfw %ax, %ax - jnz LeaveLoopCmps - - add $16, %rdx - - jmp LoopCmps -LeaveLoopCmps: add %rax, %rdx -#else - mov (%windowbestlen, %rdx), %rax - xor (%prev, %rdx), %rax - jnz LeaveLoopCmps - - mov 8(%windowbestlen, %rdx), %rax - xor 8(%prev, %rdx), %rax - jnz LeaveLoopCmps8 - - mov 16(%windowbestlen, %rdx), %rax - xor 16(%prev, %rdx), %rax - jnz LeaveLoopCmps16 - - add $24, %rdx - jnz LoopCmps - jmp LenMaximum -# if 0 -/* - * This three-liner is tantalizingly simple, but bsf is a slow instruction, - * and the complicated alternative down below is quite a bit faster. Sad... - */ - -LeaveLoopCmps: bsf %rax, %rax /* find the first non-zero bit */ - shrl $3, %eax /* divide by 8 to get the byte */ - add %rax, %rdx -# else -LeaveLoopCmps16: - add $8, %rdx -LeaveLoopCmps8: - add $8, %rdx -LeaveLoopCmps: testl $0xFFFFFFFF, %eax /* Check the first 4 bytes */ - jnz Check16 - add $4, %rdx - shr $32, %rax -Check16: testw $0xFFFF, %ax - jnz LenLower - add $2, %rdx - shrl $16, %eax -LenLower: subb $1, %al - adc $0, %rdx -# endif -#endif - -/* Calculate the length of the match. If it is longer than MAX_MATCH, */ -/* then automatically accept it as the best possible match and leave. */ - - lea (%prev, %rdx), %rax - sub %scan, %rax - cmpl $MAX_MATCH, %eax - jge LenMaximum - -/* If the length of the match is not longer than the best match we */ -/* have so far, then forget it and return to the lookup loop. */ - - cmpl %bestlend, %eax - jg LongerMatch - mov _windowbestlen, %windowbestlen - mov dsPrev, %prev - movl _chainlenwmask, %edx - jmp LookupLoop - -/* s->match_start = cur_match; */ -/* best_len = len; */ -/* if (len >= nice_match) break; */ -/* scan_end = *(ushf*)(scan+best_len-1); */ - -LongerMatch: - movl %eax, %bestlend - movl %curmatchd, dsMatchStart - cmpl %nicematch, %eax - jge LeaveNow - - lea (%window, %bestlen), %windowbestlen - mov %windowbestlen, _windowbestlen - - movzwl -1(%scan, %rax), %scanend - mov dsPrev, %prev - movl _chainlenwmask, %chainlenwmask - jmp LookupLoop - -/* Accept the current string, with the maximum possible length. */ - -LenMaximum: - movl $MAX_MATCH, %bestlend - movl %curmatchd, dsMatchStart - -/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ -/* return s->lookahead; */ - -LeaveNow: - movl dsLookahead, %eax - cmpl %eax, %bestlend - cmovngl %bestlend, %eax -LookaheadRet: - -/* Restore the registers and return from whence we came. */ - - mov save_rsi, %rsi - mov save_rbx, %rbx - mov save_r12, %r12 - mov save_r13, %r13 - mov save_r14, %r14 - mov save_r15, %r15 - - ret - -match_init: ret diff --git a/contrib/asm686/README.686 b/contrib/asm686/README.686 deleted file mode 100644 index a0bf3be..0000000 --- a/contrib/asm686/README.686 +++ /dev/null @@ -1,51 +0,0 @@ -This is a patched version of zlib, modified to use -Pentium-Pro-optimized assembly code in the deflation algorithm. The -files changed/added by this patch are: - -README.686 -match.S - -The speedup that this patch provides varies, depending on whether the -compiler used to build the original version of zlib falls afoul of the -PPro's speed traps. My own tests show a speedup of around 10-20% at -the default compression level, and 20-30% using -9, against a version -compiled using gcc 2.7.2.3. Your mileage may vary. - -Note that this code has been tailored for the PPro/PII in particular, -and will not perform particuarly well on a Pentium. - -If you are using an assembler other than GNU as, you will have to -translate match.S to use your assembler's syntax. (Have fun.) - -Brian Raiter -breadbox@muppetlabs.com -April, 1998 - - -Added for zlib 1.1.3: - -The patches come from -http://www.muppetlabs.com/~breadbox/software/assembly.html - -To compile zlib with this asm file, copy match.S to the zlib directory -then do: - -CFLAGS="-O3 -DASMV" ./configure -make OBJA=match.o - - -Update: - -I've been ignoring these assembly routines for years, believing that -gcc's generated code had caught up with it sometime around gcc 2.95 -and the major rearchitecting of the Pentium 4. However, I recently -learned that, despite what I believed, this code still has some life -in it. On the Pentium 4 and AMD64 chips, it continues to run about 8% -faster than the code produced by gcc 4.1. - -In acknowledgement of its continuing usefulness, I've altered the -license to match that of the rest of zlib. Share and Enjoy! - -Brian Raiter -breadbox@muppetlabs.com -April, 2007 diff --git a/contrib/asm686/match.S b/contrib/asm686/match.S deleted file mode 100644 index fa42109..0000000 --- a/contrib/asm686/match.S +++ /dev/null @@ -1,357 +0,0 @@ -/* match.S -- x86 assembly version of the zlib longest_match() function. - * Optimized for the Intel 686 chips (PPro and later). - * - * Copyright (C) 1998, 2007 Brian Raiter - * - * This software is provided 'as-is', without any express or implied - * warranty. In no event will the author be held liable for any damages - * arising from the use of this software. - * - * Permission is granted to anyone to use this software for any purpose, - * including commercial applications, and to alter it and redistribute it - * freely, subject to the following restrictions: - * - * 1. The origin of this software must not be misrepresented; you must not - * claim that you wrote the original software. If you use this software - * in a product, an acknowledgment in the product documentation would be - * appreciated but is not required. - * 2. Altered source versions must be plainly marked as such, and must not be - * misrepresented as being the original software. - * 3. This notice may not be removed or altered from any source distribution. - */ - -#ifndef NO_UNDERLINE -#define match_init _match_init -#define longest_match _longest_match -#endif - -#define MAX_MATCH (258) -#define MIN_MATCH (3) -#define MIN_LOOKAHEAD (MAX_MATCH + MIN_MATCH + 1) -#define MAX_MATCH_8 ((MAX_MATCH + 7) & ~7) - -/* stack frame offsets */ - -#define chainlenwmask 0 /* high word: current chain len */ - /* low word: s->wmask */ -#define window 4 /* local copy of s->window */ -#define windowbestlen 8 /* s->window + bestlen */ -#define scanstart 16 /* first two bytes of string */ -#define scanend 12 /* last two bytes of string */ -#define scanalign 20 /* dword-misalignment of string */ -#define nicematch 24 /* a good enough match size */ -#define bestlen 28 /* size of best match so far */ -#define scan 32 /* ptr to string wanting match */ - -#define LocalVarsSize (36) -/* saved ebx 36 */ -/* saved edi 40 */ -/* saved esi 44 */ -/* saved ebp 48 */ -/* return address 52 */ -#define deflatestate 56 /* the function arguments */ -#define curmatch 60 - -/* All the +zlib1222add offsets are due to the addition of fields - * in zlib in the deflate_state structure since the asm code was first written - * (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). - * (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). - * if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). - */ - -#define zlib1222add (8) - -#define dsWSize (36+zlib1222add) -#define dsWMask (44+zlib1222add) -#define dsWindow (48+zlib1222add) -#define dsPrev (56+zlib1222add) -#define dsMatchLen (88+zlib1222add) -#define dsPrevMatch (92+zlib1222add) -#define dsStrStart (100+zlib1222add) -#define dsMatchStart (104+zlib1222add) -#define dsLookahead (108+zlib1222add) -#define dsPrevLen (112+zlib1222add) -#define dsMaxChainLen (116+zlib1222add) -#define dsGoodMatch (132+zlib1222add) -#define dsNiceMatch (136+zlib1222add) - - -.file "match.S" - -.globl match_init, longest_match - -.text - -/* uInt longest_match(deflate_state *deflatestate, IPos curmatch) */ -.cfi_sections .debug_frame - -longest_match: - -.cfi_startproc -/* Save registers that the compiler may be using, and adjust %esp to */ -/* make room for our stack frame. */ - - pushl %ebp - .cfi_def_cfa_offset 8 - .cfi_offset ebp, -8 - pushl %edi - .cfi_def_cfa_offset 12 - pushl %esi - .cfi_def_cfa_offset 16 - pushl %ebx - .cfi_def_cfa_offset 20 - subl $LocalVarsSize, %esp - .cfi_def_cfa_offset LocalVarsSize+20 - -/* Retrieve the function arguments. %ecx will hold cur_match */ -/* throughout the entire function. %edx will hold the pointer to the */ -/* deflate_state structure during the function's setup (before */ -/* entering the main loop). */ - - movl deflatestate(%esp), %edx - movl curmatch(%esp), %ecx - -/* uInt wmask = s->w_mask; */ -/* unsigned chain_length = s->max_chain_length; */ -/* if (s->prev_length >= s->good_match) { */ -/* chain_length >>= 2; */ -/* } */ - - movl dsPrevLen(%edx), %eax - movl dsGoodMatch(%edx), %ebx - cmpl %ebx, %eax - movl dsWMask(%edx), %eax - movl dsMaxChainLen(%edx), %ebx - jl LastMatchGood - shrl $2, %ebx -LastMatchGood: - -/* chainlen is decremented once beforehand so that the function can */ -/* use the sign flag instead of the zero flag for the exit test. */ -/* It is then shifted into the high word, to make room for the wmask */ -/* value, which it will always accompany. */ - - decl %ebx - shll $16, %ebx - orl %eax, %ebx - movl %ebx, chainlenwmask(%esp) - -/* if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; */ - - movl dsNiceMatch(%edx), %eax - movl dsLookahead(%edx), %ebx - cmpl %eax, %ebx - jl LookaheadLess - movl %eax, %ebx -LookaheadLess: movl %ebx, nicematch(%esp) - -/* register Bytef *scan = s->window + s->strstart; */ - - movl dsWindow(%edx), %esi - movl %esi, window(%esp) - movl dsStrStart(%edx), %ebp - lea (%esi,%ebp), %edi - movl %edi, scan(%esp) - -/* Determine how many bytes the scan ptr is off from being */ -/* dword-aligned. */ - - movl %edi, %eax - negl %eax - andl $3, %eax - movl %eax, scanalign(%esp) - -/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ -/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ - - movl dsWSize(%edx), %eax - subl $MIN_LOOKAHEAD, %eax - subl %eax, %ebp - jg LimitPositive - xorl %ebp, %ebp -LimitPositive: - -/* int best_len = s->prev_length; */ - - movl dsPrevLen(%edx), %eax - movl %eax, bestlen(%esp) - -/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ - - addl %eax, %esi - movl %esi, windowbestlen(%esp) - -/* register ush scan_start = *(ushf*)scan; */ -/* register ush scan_end = *(ushf*)(scan+best_len-1); */ -/* Posf *prev = s->prev; */ - - movzwl (%edi), %ebx - movl %ebx, scanstart(%esp) - movzwl -1(%edi,%eax), %ebx - movl %ebx, scanend(%esp) - movl dsPrev(%edx), %edi - -/* Jump into the main loop. */ - - movl chainlenwmask(%esp), %edx - jmp LoopEntry - -.balign 16 - -/* do { - * match = s->window + cur_match; - * if (*(ushf*)(match+best_len-1) != scan_end || - * *(ushf*)match != scan_start) continue; - * [...] - * } while ((cur_match = prev[cur_match & wmask]) > limit - * && --chain_length != 0); - * - * Here is the inner loop of the function. The function will spend the - * majority of its time in this loop, and majority of that time will - * be spent in the first ten instructions. - * - * Within this loop: - * %ebx = scanend - * %ecx = curmatch - * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) - * %esi = windowbestlen - i.e., (window + bestlen) - * %edi = prev - * %ebp = limit - */ -LookupLoop: - andl %edx, %ecx - movzwl (%edi,%ecx,2), %ecx - cmpl %ebp, %ecx - jbe LeaveNow - subl $0x00010000, %edx - js LeaveNow -LoopEntry: movzwl -1(%esi,%ecx), %eax - cmpl %ebx, %eax - jnz LookupLoop - movl window(%esp), %eax - movzwl (%eax,%ecx), %eax - cmpl scanstart(%esp), %eax - jnz LookupLoop - -/* Store the current value of chainlen. */ - - movl %edx, chainlenwmask(%esp) - -/* Point %edi to the string under scrutiny, and %esi to the string we */ -/* are hoping to match it up with. In actuality, %esi and %edi are */ -/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ -/* initialized to -(MAX_MATCH_8 - scanalign). */ - - movl window(%esp), %esi - movl scan(%esp), %edi - addl %ecx, %esi - movl scanalign(%esp), %eax - movl $(-MAX_MATCH_8), %edx - lea MAX_MATCH_8(%edi,%eax), %edi - lea MAX_MATCH_8(%esi,%eax), %esi - -/* Test the strings for equality, 8 bytes at a time. At the end, - * adjust %edx so that it is offset to the exact byte that mismatched. - * - * We already know at this point that the first three bytes of the - * strings match each other, and they can be safely passed over before - * starting the compare loop. So what this code does is skip over 0-3 - * bytes, as much as necessary in order to dword-align the %edi - * pointer. (%esi will still be misaligned three times out of four.) - * - * It should be confessed that this loop usually does not represent - * much of the total running time. Replacing it with a more - * straightforward "rep cmpsb" would not drastically degrade - * performance. - */ -LoopCmps: - movl (%esi,%edx), %eax - xorl (%edi,%edx), %eax - jnz LeaveLoopCmps - movl 4(%esi,%edx), %eax - xorl 4(%edi,%edx), %eax - jnz LeaveLoopCmps4 - addl $8, %edx - jnz LoopCmps - jmp LenMaximum -LeaveLoopCmps4: addl $4, %edx -LeaveLoopCmps: testl $0x0000FFFF, %eax - jnz LenLower - addl $2, %edx - shrl $16, %eax -LenLower: subb $1, %al - adcl $0, %edx - -/* Calculate the length of the match. If it is longer than MAX_MATCH, */ -/* then automatically accept it as the best possible match and leave. */ - - lea (%edi,%edx), %eax - movl scan(%esp), %edi - subl %edi, %eax - cmpl $MAX_MATCH, %eax - jge LenMaximum - -/* If the length of the match is not longer than the best match we */ -/* have so far, then forget it and return to the lookup loop. */ - - movl deflatestate(%esp), %edx - movl bestlen(%esp), %ebx - cmpl %ebx, %eax - jg LongerMatch - movl windowbestlen(%esp), %esi - movl dsPrev(%edx), %edi - movl scanend(%esp), %ebx - movl chainlenwmask(%esp), %edx - jmp LookupLoop - -/* s->match_start = cur_match; */ -/* best_len = len; */ -/* if (len >= nice_match) break; */ -/* scan_end = *(ushf*)(scan+best_len-1); */ - -LongerMatch: movl nicematch(%esp), %ebx - movl %eax, bestlen(%esp) - movl %ecx, dsMatchStart(%edx) - cmpl %ebx, %eax - jge LeaveNow - movl window(%esp), %esi - addl %eax, %esi - movl %esi, windowbestlen(%esp) - movzwl -1(%edi,%eax), %ebx - movl dsPrev(%edx), %edi - movl %ebx, scanend(%esp) - movl chainlenwmask(%esp), %edx - jmp LookupLoop - -/* Accept the current string, with the maximum possible length. */ - -LenMaximum: movl deflatestate(%esp), %edx - movl $MAX_MATCH, bestlen(%esp) - movl %ecx, dsMatchStart(%edx) - -/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ -/* return s->lookahead; */ - -LeaveNow: - movl deflatestate(%esp), %edx - movl bestlen(%esp), %ebx - movl dsLookahead(%edx), %eax - cmpl %eax, %ebx - jg LookaheadRet - movl %ebx, %eax -LookaheadRet: - -/* Restore the stack and return from whence we came. */ - - addl $LocalVarsSize, %esp - .cfi_def_cfa_offset 20 - popl %ebx - .cfi_def_cfa_offset 16 - popl %esi - .cfi_def_cfa_offset 12 - popl %edi - .cfi_def_cfa_offset 8 - popl %ebp - .cfi_def_cfa_offset 4 -.cfi_endproc -match_init: ret diff --git a/contrib/inflate86/inffas86.c b/contrib/inflate86/inffas86.c deleted file mode 100644 index 7292f67..0000000 --- a/contrib/inflate86/inffas86.c +++ /dev/null @@ -1,1157 +0,0 @@ -/* inffas86.c is a hand tuned assembler version of - * - * inffast.c -- fast decoding - * Copyright (C) 1995-2003 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - * - * Copyright (C) 2003 Chris Anderson - * Please use the copyright conditions above. - * - * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also - * slightly quicker on x86 systems because, instead of using rep movsb to copy - * data, it uses rep movsw, which moves data in 2-byte chunks instead of single - * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates - * from http://fedora.linux.duke.edu/fc1_x86_64 - * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with - * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version, - * when decompressing mozilla-source-1.3.tar.gz. - * - * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from - * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at - * the moment. I have successfully compiled and tested this code with gcc2.96, - * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S - * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX - * enabled. I will attempt to merge the MMX code into this version. Newer - * versions of this and inffast.S can be found at - * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ - */ - -#include "zutil.h" -#include "inftrees.h" -#include "inflate.h" -#include "inffast.h" - -/* Mark Adler's comments from inffast.c: */ - -/* - Decode literal, length, and distance codes and write out the resulting - literal and match bytes until either not enough input or output is - available, an end-of-block is encountered, or a data error is encountered. - When large enough input and output buffers are supplied to inflate(), for - example, a 16K input buffer and a 64K output buffer, more than 95% of the - inflate execution time is spent in this routine. - - Entry assumptions: - - state->mode == LEN - strm->avail_in >= 6 - strm->avail_out >= 258 - start >= strm->avail_out - state->bits < 8 - - On return, state->mode is one of: - - LEN -- ran out of enough output space or enough available input - TYPE -- reached end of block code, inflate() to interpret next block - BAD -- error in block data - - Notes: - - - The maximum input bits used by a length/distance pair is 15 bits for the - length code, 5 bits for the length extra, 15 bits for the distance code, - and 13 bits for the distance extra. This totals 48 bits, or six bytes. - Therefore if strm->avail_in >= 6, then there is enough input to avoid - checking for available input while decoding. - - - The maximum bytes that a single length/distance pair can output is 258 - bytes, which is the maximum length that can be coded. inflate_fast() - requires strm->avail_out >= 258 for each loop to avoid checking for - output space. - */ -void inflate_fast(strm, start) -z_streamp strm; -unsigned start; /* inflate()'s starting value for strm->avail_out */ -{ - struct inflate_state FAR *state; - struct inffast_ar { -/* 64 32 x86 x86_64 */ -/* ar offset register */ -/* 0 0 */ void *esp; /* esp save */ -/* 8 4 */ void *ebp; /* ebp save */ -/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */ -/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */ -/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */ -/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */ -/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */ -/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */ -/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */ -/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */ -/* 80 40 */ unsigned long hold; /* edx rdx local strm->hold */ -/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */ -/* 92 48 */ unsigned wsize; /* window size */ -/* 96 52 */ unsigned write; /* window write index */ -/*100 56 */ unsigned lmask; /* r12 mask for lcode */ -/*104 60 */ unsigned dmask; /* r13 mask for dcode */ -/*108 64 */ unsigned len; /* r14 match length */ -/*112 68 */ unsigned dist; /* r15 match distance */ -/*116 72 */ unsigned status; /* set when state chng*/ - } ar; - -#if defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 ) -#define PAD_AVAIL_IN 6 -#define PAD_AVAIL_OUT 258 -#else -#define PAD_AVAIL_IN 5 -#define PAD_AVAIL_OUT 257 -#endif - - /* copy state to local variables */ - state = (struct inflate_state FAR *)strm->state; - ar.in = strm->next_in; - ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN); - ar.out = strm->next_out; - ar.beg = ar.out - (start - strm->avail_out); - ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT); - ar.wsize = state->wsize; - ar.write = state->wnext; - ar.window = state->window; - ar.hold = state->hold; - ar.bits = state->bits; - ar.lcode = state->lencode; - ar.dcode = state->distcode; - ar.lmask = (1U << state->lenbits) - 1; - ar.dmask = (1U << state->distbits) - 1; - - /* decode literals and length/distances until end-of-block or not enough - input data or output space */ - - /* align in on 1/2 hold size boundary */ - while (((unsigned long)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) { - ar.hold += (unsigned long)*ar.in++ << ar.bits; - ar.bits += 8; - } - -#if defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 ) - __asm__ __volatile__ ( -" leaq %0, %%rax\n" -" movq %%rbp, 8(%%rax)\n" /* save regs rbp and rsp */ -" movq %%rsp, (%%rax)\n" -" movq %%rax, %%rsp\n" /* make rsp point to &ar */ -" movq 16(%%rsp), %%rsi\n" /* rsi = in */ -" movq 32(%%rsp), %%rdi\n" /* rdi = out */ -" movq 24(%%rsp), %%r9\n" /* r9 = last */ -" movq 48(%%rsp), %%r10\n" /* r10 = end */ -" movq 64(%%rsp), %%rbp\n" /* rbp = lcode */ -" movq 72(%%rsp), %%r11\n" /* r11 = dcode */ -" movq 80(%%rsp), %%rdx\n" /* rdx = hold */ -" movl 88(%%rsp), %%ebx\n" /* ebx = bits */ -" movl 100(%%rsp), %%r12d\n" /* r12d = lmask */ -" movl 104(%%rsp), %%r13d\n" /* r13d = dmask */ - /* r14d = len */ - /* r15d = dist */ -" cld\n" -" cmpq %%rdi, %%r10\n" -" je .L_one_time\n" /* if only one decode left */ -" cmpq %%rsi, %%r9\n" -" je .L_one_time\n" -" jmp .L_do_loop\n" - -".L_one_time:\n" -" movq %%r12, %%r8\n" /* r8 = lmask */ -" cmpb $32, %%bl\n" -" ja .L_get_length_code_one_time\n" - -" lodsl\n" /* eax = *(uint *)in++ */ -" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */ -" addb $32, %%bl\n" /* bits += 32 */ -" shlq %%cl, %%rax\n" -" orq %%rax, %%rdx\n" /* hold |= *((uint *)in)++ << bits */ -" jmp .L_get_length_code_one_time\n" - -".align 32,0x90\n" -".L_while_test:\n" -" cmpq %%rdi, %%r10\n" -" jbe .L_break_loop\n" -" cmpq %%rsi, %%r9\n" -" jbe .L_break_loop\n" - -".L_do_loop:\n" -" movq %%r12, %%r8\n" /* r8 = lmask */ -" cmpb $32, %%bl\n" -" ja .L_get_length_code\n" /* if (32 < bits) */ - -" lodsl\n" /* eax = *(uint *)in++ */ -" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */ -" addb $32, %%bl\n" /* bits += 32 */ -" shlq %%cl, %%rax\n" -" orq %%rax, %%rdx\n" /* hold |= *((uint *)in)++ << bits */ - -".L_get_length_code:\n" -" andq %%rdx, %%r8\n" /* r8 &= hold */ -" movl (%%rbp,%%r8,4), %%eax\n" /* eax = lcode[hold & lmask] */ - -" movb %%ah, %%cl\n" /* cl = this.bits */ -" subb %%ah, %%bl\n" /* bits -= this.bits */ -" shrq %%cl, %%rdx\n" /* hold >>= this.bits */ - -" testb %%al, %%al\n" -" jnz .L_test_for_length_base\n" /* if (op != 0) 45.7% */ - -" movq %%r12, %%r8\n" /* r8 = lmask */ -" shrl $16, %%eax\n" /* output this.val char */ -" stosb\n" - -".L_get_length_code_one_time:\n" -" andq %%rdx, %%r8\n" /* r8 &= hold */ -" movl (%%rbp,%%r8,4), %%eax\n" /* eax = lcode[hold & lmask] */ - -".L_dolen:\n" -" movb %%ah, %%cl\n" /* cl = this.bits */ -" subb %%ah, %%bl\n" /* bits -= this.bits */ -" shrq %%cl, %%rdx\n" /* hold >>= this.bits */ - -" testb %%al, %%al\n" -" jnz .L_test_for_length_base\n" /* if (op != 0) 45.7% */ - -" shrl $16, %%eax\n" /* output this.val char */ -" stosb\n" -" jmp .L_while_test\n" - -".align 32,0x90\n" -".L_test_for_length_base:\n" -" movl %%eax, %%r14d\n" /* len = this */ -" shrl $16, %%r14d\n" /* len = this.val */ -" movb %%al, %%cl\n" - -" testb $16, %%al\n" -" jz .L_test_for_second_level_length\n" /* if ((op & 16) == 0) 8% */ -" andb $15, %%cl\n" /* op &= 15 */ -" jz .L_decode_distance\n" /* if (!op) */ - -".L_add_bits_to_len:\n" -" subb %%cl, %%bl\n" -" xorl %%eax, %%eax\n" -" incl %%eax\n" -" shll %%cl, %%eax\n" -" decl %%eax\n" -" andl %%edx, %%eax\n" /* eax &= hold */ -" shrq %%cl, %%rdx\n" -" addl %%eax, %%r14d\n" /* len += hold & mask[op] */ - -".L_decode_distance:\n" -" movq %%r13, %%r8\n" /* r8 = dmask */ -" cmpb $32, %%bl\n" -" ja .L_get_distance_code\n" /* if (32 < bits) */ - -" lodsl\n" /* eax = *(uint *)in++ */ -" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */ -" addb $32, %%bl\n" /* bits += 32 */ -" shlq %%cl, %%rax\n" -" orq %%rax, %%rdx\n" /* hold |= *((uint *)in)++ << bits */ - -".L_get_distance_code:\n" -" andq %%rdx, %%r8\n" /* r8 &= hold */ -" movl (%%r11,%%r8,4), %%eax\n" /* eax = dcode[hold & dmask] */ - -".L_dodist:\n" -" movl %%eax, %%r15d\n" /* dist = this */ -" shrl $16, %%r15d\n" /* dist = this.val */ -" movb %%ah, %%cl\n" -" subb %%ah, %%bl\n" /* bits -= this.bits */ -" shrq %%cl, %%rdx\n" /* hold >>= this.bits */ -" movb %%al, %%cl\n" /* cl = this.op */ - -" testb $16, %%al\n" /* if ((op & 16) == 0) */ -" jz .L_test_for_second_level_dist\n" -" andb $15, %%cl\n" /* op &= 15 */ -" jz .L_check_dist_one\n" - -".L_add_bits_to_dist:\n" -" subb %%cl, %%bl\n" -" xorl %%eax, %%eax\n" -" incl %%eax\n" -" shll %%cl, %%eax\n" -" decl %%eax\n" /* (1 << op) - 1 */ -" andl %%edx, %%eax\n" /* eax &= hold */ -" shrq %%cl, %%rdx\n" -" addl %%eax, %%r15d\n" /* dist += hold & ((1 << op) - 1) */ - -".L_check_window:\n" -" movq %%rsi, %%r8\n" /* save in so from can use it's reg */ -" movq %%rdi, %%rax\n" -" subq 40(%%rsp), %%rax\n" /* nbytes = out - beg */ - -" cmpl %%r15d, %%eax\n" -" jb .L_clip_window\n" /* if (dist > nbytes) 4.2% */ - -" movl %%r14d, %%ecx\n" /* ecx = len */ -" movq %%rdi, %%rsi\n" -" subq %%r15, %%rsi\n" /* from = out - dist */ - -" sarl %%ecx\n" -" jnc .L_copy_two\n" /* if len % 2 == 0 */ - -" rep movsw\n" -" movb (%%rsi), %%al\n" -" movb %%al, (%%rdi)\n" -" incq %%rdi\n" - -" movq %%r8, %%rsi\n" /* move in back to %rsi, toss from */ -" jmp .L_while_test\n" - -".L_copy_two:\n" -" rep movsw\n" -" movq %%r8, %%rsi\n" /* move in back to %rsi, toss from */ -" jmp .L_while_test\n" - -".align 32,0x90\n" -".L_check_dist_one:\n" -" cmpl $1, %%r15d\n" /* if dist 1, is a memset */ -" jne .L_check_window\n" -" cmpq %%rdi, 40(%%rsp)\n" /* if out == beg, outside window */ -" je .L_check_window\n" - -" movl %%r14d, %%ecx\n" /* ecx = len */ -" movb -1(%%rdi), %%al\n" -" movb %%al, %%ah\n" - -" sarl %%ecx\n" -" jnc .L_set_two\n" -" movb %%al, (%%rdi)\n" -" incq %%rdi\n" - -".L_set_two:\n" -" rep stosw\n" -" jmp .L_while_test\n" - -".align 32,0x90\n" -".L_test_for_second_level_length:\n" -" testb $64, %%al\n" -" jnz .L_test_for_end_of_block\n" /* if ((op & 64) != 0) */ - -" xorl %%eax, %%eax\n" -" incl %%eax\n" -" shll %%cl, %%eax\n" -" decl %%eax\n" -" andl %%edx, %%eax\n" /* eax &= hold */ -" addl %%r14d, %%eax\n" /* eax += len */ -" movl (%%rbp,%%rax,4), %%eax\n" /* eax = lcode[val+(hold&mask[op])]*/ -" jmp .L_dolen\n" - -".align 32,0x90\n" -".L_test_for_second_level_dist:\n" -" testb $64, %%al\n" -" jnz .L_invalid_distance_code\n" /* if ((op & 64) != 0) */ - -" xorl %%eax, %%eax\n" -" incl %%eax\n" -" shll %%cl, %%eax\n" -" decl %%eax\n" -" andl %%edx, %%eax\n" /* eax &= hold */ -" addl %%r15d, %%eax\n" /* eax += dist */ -" movl (%%r11,%%rax,4), %%eax\n" /* eax = dcode[val+(hold&mask[op])]*/ -" jmp .L_dodist\n" - -".align 32,0x90\n" -".L_clip_window:\n" -" movl %%eax, %%ecx\n" /* ecx = nbytes */ -" movl 92(%%rsp), %%eax\n" /* eax = wsize, prepare for dist cmp */ -" negl %%ecx\n" /* nbytes = -nbytes */ - -" cmpl %%r15d, %%eax\n" -" jb .L_invalid_distance_too_far\n" /* if (dist > wsize) */ - -" addl %%r15d, %%ecx\n" /* nbytes = dist - nbytes */ -" cmpl $0, 96(%%rsp)\n" -" jne .L_wrap_around_window\n" /* if (write != 0) */ - -" movq 56(%%rsp), %%rsi\n" /* from = window */ -" subl %%ecx, %%eax\n" /* eax -= nbytes */ -" addq %%rax, %%rsi\n" /* from += wsize - nbytes */ - -" movl %%r14d, %%eax\n" /* eax = len */ -" cmpl %%ecx, %%r14d\n" -" jbe .L_do_copy\n" /* if (nbytes >= len) */ - -" subl %%ecx, %%eax\n" /* eax -= nbytes */ -" rep movsb\n" -" movq %%rdi, %%rsi\n" -" subq %%r15, %%rsi\n" /* from = &out[ -dist ] */ -" jmp .L_do_copy\n" - -".align 32,0x90\n" -".L_wrap_around_window:\n" -" movl 96(%%rsp), %%eax\n" /* eax = write */ -" cmpl %%eax, %%ecx\n" -" jbe .L_contiguous_in_window\n" /* if (write >= nbytes) */ - -" movl 92(%%rsp), %%esi\n" /* from = wsize */ -" addq 56(%%rsp), %%rsi\n" /* from += window */ -" addq %%rax, %%rsi\n" /* from += write */ -" subq %%rcx, %%rsi\n" /* from -= nbytes */ -" subl %%eax, %%ecx\n" /* nbytes -= write */ - -" movl %%r14d, %%eax\n" /* eax = len */ -" cmpl %%ecx, %%eax\n" -" jbe .L_do_copy\n" /* if (nbytes >= len) */ - -" subl %%ecx, %%eax\n" /* len -= nbytes */ -" rep movsb\n" -" movq 56(%%rsp), %%rsi\n" /* from = window */ -" movl 96(%%rsp), %%ecx\n" /* nbytes = write */ -" cmpl %%ecx, %%eax\n" -" jbe .L_do_copy\n" /* if (nbytes >= len) */ - -" subl %%ecx, %%eax\n" /* len -= nbytes */ -" rep movsb\n" -" movq %%rdi, %%rsi\n" -" subq %%r15, %%rsi\n" /* from = out - dist */ -" jmp .L_do_copy\n" - -".align 32,0x90\n" -".L_contiguous_in_window:\n" -" movq 56(%%rsp), %%rsi\n" /* rsi = window */ -" addq %%rax, %%rsi\n" -" subq %%rcx, %%rsi\n" /* from += write - nbytes */ - -" movl %%r14d, %%eax\n" /* eax = len */ -" cmpl %%ecx, %%eax\n" -" jbe .L_do_copy\n" /* if (nbytes >= len) */ - -" subl %%ecx, %%eax\n" /* len -= nbytes */ -" rep movsb\n" -" movq %%rdi, %%rsi\n" -" subq %%r15, %%rsi\n" /* from = out - dist */ -" jmp .L_do_copy\n" /* if (nbytes >= len) */ - -".align 32,0x90\n" -".L_do_copy:\n" -" movl %%eax, %%ecx\n" /* ecx = len */ -" rep movsb\n" - -" movq %%r8, %%rsi\n" /* move in back to %esi, toss from */ -" jmp .L_while_test\n" - -".L_test_for_end_of_block:\n" -" testb $32, %%al\n" -" jz .L_invalid_literal_length_code\n" -" movl $1, 116(%%rsp)\n" -" jmp .L_break_loop_with_status\n" - -".L_invalid_literal_length_code:\n" -" movl $2, 116(%%rsp)\n" -" jmp .L_break_loop_with_status\n" - -".L_invalid_distance_code:\n" -" movl $3, 116(%%rsp)\n" -" jmp .L_break_loop_with_status\n" - -".L_invalid_distance_too_far:\n" -" movl $4, 116(%%rsp)\n" -" jmp .L_break_loop_with_status\n" - -".L_break_loop:\n" -" movl $0, 116(%%rsp)\n" - -".L_break_loop_with_status:\n" -/* put in, out, bits, and hold back into ar and pop esp */ -" movq %%rsi, 16(%%rsp)\n" /* in */ -" movq %%rdi, 32(%%rsp)\n" /* out */ -" movl %%ebx, 88(%%rsp)\n" /* bits */ -" movq %%rdx, 80(%%rsp)\n" /* hold */ -" movq (%%rsp), %%rax\n" /* restore rbp and rsp */ -" movq 8(%%rsp), %%rbp\n" -" movq %%rax, %%rsp\n" - : - : "m" (ar) - : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", - "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" - ); -#elif ( defined( __GNUC__ ) || defined( __ICC ) ) && defined( __i386 ) - __asm__ __volatile__ ( -" leal %0, %%eax\n" -" movl %%esp, (%%eax)\n" /* save esp, ebp */ -" movl %%ebp, 4(%%eax)\n" -" movl %%eax, %%esp\n" -" movl 8(%%esp), %%esi\n" /* esi = in */ -" movl 16(%%esp), %%edi\n" /* edi = out */ -" movl 40(%%esp), %%edx\n" /* edx = hold */ -" movl 44(%%esp), %%ebx\n" /* ebx = bits */ -" movl 32(%%esp), %%ebp\n" /* ebp = lcode */ - -" cld\n" -" jmp .L_do_loop\n" - -".align 32,0x90\n" -".L_while_test:\n" -" cmpl %%edi, 24(%%esp)\n" /* out < end */ -" jbe .L_break_loop\n" -" cmpl %%esi, 12(%%esp)\n" /* in < last */ -" jbe .L_break_loop\n" - -".L_do_loop:\n" -" cmpb $15, %%bl\n" -" ja .L_get_length_code\n" /* if (15 < bits) */ - -" xorl %%eax, %%eax\n" -" lodsw\n" /* al = *(ushort *)in++ */ -" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */ -" addb $16, %%bl\n" /* bits += 16 */ -" shll %%cl, %%eax\n" -" orl %%eax, %%edx\n" /* hold |= *((ushort *)in)++ << bits */ - -".L_get_length_code:\n" -" movl 56(%%esp), %%eax\n" /* eax = lmask */ -" andl %%edx, %%eax\n" /* eax &= hold */ -" movl (%%ebp,%%eax,4), %%eax\n" /* eax = lcode[hold & lmask] */ - -".L_dolen:\n" -" movb %%ah, %%cl\n" /* cl = this.bits */ -" subb %%ah, %%bl\n" /* bits -= this.bits */ -" shrl %%cl, %%edx\n" /* hold >>= this.bits */ - -" testb %%al, %%al\n" -" jnz .L_test_for_length_base\n" /* if (op != 0) 45.7% */ - -" shrl $16, %%eax\n" /* output this.val char */ -" stosb\n" -" jmp .L_while_test\n" - -".align 32,0x90\n" -".L_test_for_length_base:\n" -" movl %%eax, %%ecx\n" /* len = this */ -" shrl $16, %%ecx\n" /* len = this.val */ -" movl %%ecx, 64(%%esp)\n" /* save len */ -" movb %%al, %%cl\n" - -" testb $16, %%al\n" -" jz .L_test_for_second_level_length\n" /* if ((op & 16) == 0) 8% */ -" andb $15, %%cl\n" /* op &= 15 */ -" jz .L_decode_distance\n" /* if (!op) */ -" cmpb %%cl, %%bl\n" -" jae .L_add_bits_to_len\n" /* if (op <= bits) */ - -" movb %%cl, %%ch\n" /* stash op in ch, freeing cl */ -" xorl %%eax, %%eax\n" -" lodsw\n" /* al = *(ushort *)in++ */ -" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */ -" addb $16, %%bl\n" /* bits += 16 */ -" shll %%cl, %%eax\n" -" orl %%eax, %%edx\n" /* hold |= *((ushort *)in)++ << bits */ -" movb %%ch, %%cl\n" /* move op back to ecx */ - -".L_add_bits_to_len:\n" -" subb %%cl, %%bl\n" -" xorl %%eax, %%eax\n" -" incl %%eax\n" -" shll %%cl, %%eax\n" -" decl %%eax\n" -" andl %%edx, %%eax\n" /* eax &= hold */ -" shrl %%cl, %%edx\n" -" addl %%eax, 64(%%esp)\n" /* len += hold & mask[op] */ - -".L_decode_distance:\n" -" cmpb $15, %%bl\n" -" ja .L_get_distance_code\n" /* if (15 < bits) */ - -" xorl %%eax, %%eax\n" -" lodsw\n" /* al = *(ushort *)in++ */ -" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */ -" addb $16, %%bl\n" /* bits += 16 */ -" shll %%cl, %%eax\n" -" orl %%eax, %%edx\n" /* hold |= *((ushort *)in)++ << bits */ - -".L_get_distance_code:\n" -" movl 60(%%esp), %%eax\n" /* eax = dmask */ -" movl 36(%%esp), %%ecx\n" /* ecx = dcode */ -" andl %%edx, %%eax\n" /* eax &= hold */ -" movl (%%ecx,%%eax,4), %%eax\n"/* eax = dcode[hold & dmask] */ - -".L_dodist:\n" -" movl %%eax, %%ebp\n" /* dist = this */ -" shrl $16, %%ebp\n" /* dist = this.val */ -" movb %%ah, %%cl\n" -" subb %%ah, %%bl\n" /* bits -= this.bits */ -" shrl %%cl, %%edx\n" /* hold >>= this.bits */ -" movb %%al, %%cl\n" /* cl = this.op */ - -" testb $16, %%al\n" /* if ((op & 16) == 0) */ -" jz .L_test_for_second_level_dist\n" -" andb $15, %%cl\n" /* op &= 15 */ -" jz .L_check_dist_one\n" -" cmpb %%cl, %%bl\n" -" jae .L_add_bits_to_dist\n" /* if (op <= bits) 97.6% */ - -" movb %%cl, %%ch\n" /* stash op in ch, freeing cl */ -" xorl %%eax, %%eax\n" -" lodsw\n" /* al = *(ushort *)in++ */ -" movb %%bl, %%cl\n" /* cl = bits, needs it for shifting */ -" addb $16, %%bl\n" /* bits += 16 */ -" shll %%cl, %%eax\n" -" orl %%eax, %%edx\n" /* hold |= *((ushort *)in)++ << bits */ -" movb %%ch, %%cl\n" /* move op back to ecx */ - -".L_add_bits_to_dist:\n" -" subb %%cl, %%bl\n" -" xorl %%eax, %%eax\n" -" incl %%eax\n" -" shll %%cl, %%eax\n" -" decl %%eax\n" /* (1 << op) - 1 */ -" andl %%edx, %%eax\n" /* eax &= hold */ -" shrl %%cl, %%edx\n" -" addl %%eax, %%ebp\n" /* dist += hold & ((1 << op) - 1) */ - -".L_check_window:\n" -" movl %%esi, 8(%%esp)\n" /* save in so from can use it's reg */ -" movl %%edi, %%eax\n" -" subl 20(%%esp), %%eax\n" /* nbytes = out - beg */ - -" cmpl %%ebp, %%eax\n" -" jb .L_clip_window\n" /* if (dist > nbytes) 4.2% */ - -" movl 64(%%esp), %%ecx\n" /* ecx = len */ -" movl %%edi, %%esi\n" -" subl %%ebp, %%esi\n" /* from = out - dist */ - -" sarl %%ecx\n" -" jnc .L_copy_two\n" /* if len % 2 == 0 */ - -" rep movsw\n" -" movb (%%esi), %%al\n" -" movb %%al, (%%edi)\n" -" incl %%edi\n" - -" movl 8(%%esp), %%esi\n" /* move in back to %esi, toss from */ -" movl 32(%%esp), %%ebp\n" /* ebp = lcode */ -" jmp .L_while_test\n" - -".L_copy_two:\n" -" rep movsw\n" -" movl 8(%%esp), %%esi\n" /* move in back to %esi, toss from */ -" movl 32(%%esp), %%ebp\n" /* ebp = lcode */ -" jmp .L_while_test\n" - -".align 32,0x90\n" -".L_check_dist_one:\n" -" cmpl $1, %%ebp\n" /* if dist 1, is a memset */ -" jne .L_check_window\n" -" cmpl %%edi, 20(%%esp)\n" -" je .L_check_window\n" /* out == beg, if outside window */ - -" movl 64(%%esp), %%ecx\n" /* ecx = len */ -" movb -1(%%edi), %%al\n" -" movb %%al, %%ah\n" - -" sarl %%ecx\n" -" jnc .L_set_two\n" -" movb %%al, (%%edi)\n" -" incl %%edi\n" - -".L_set_two:\n" -" rep stosw\n" -" movl 32(%%esp), %%ebp\n" /* ebp = lcode */ -" jmp .L_while_test\n" - -".align 32,0x90\n" -".L_test_for_second_level_length:\n" -" testb $64, %%al\n" -" jnz .L_test_for_end_of_block\n" /* if ((op & 64) != 0) */ - -" xorl %%eax, %%eax\n" -" incl %%eax\n" -" shll %%cl, %%eax\n" -" decl %%eax\n" -" andl %%edx, %%eax\n" /* eax &= hold */ -" addl 64(%%esp), %%eax\n" /* eax += len */ -" movl (%%ebp,%%eax,4), %%eax\n" /* eax = lcode[val+(hold&mask[op])]*/ -" jmp .L_dolen\n" - -".align 32,0x90\n" -".L_test_for_second_level_dist:\n" -" testb $64, %%al\n" -" jnz .L_invalid_distance_code\n" /* if ((op & 64) != 0) */ - -" xorl %%eax, %%eax\n" -" incl %%eax\n" -" shll %%cl, %%eax\n" -" decl %%eax\n" -" andl %%edx, %%eax\n" /* eax &= hold */ -" addl %%ebp, %%eax\n" /* eax += dist */ -" movl 36(%%esp), %%ecx\n" /* ecx = dcode */ -" movl (%%ecx,%%eax,4), %%eax\n" /* eax = dcode[val+(hold&mask[op])]*/ -" jmp .L_dodist\n" - -".align 32,0x90\n" -".L_clip_window:\n" -" movl %%eax, %%ecx\n" -" movl 48(%%esp), %%eax\n" /* eax = wsize */ -" negl %%ecx\n" /* nbytes = -nbytes */ -" movl 28(%%esp), %%esi\n" /* from = window */ - -" cmpl %%ebp, %%eax\n" -" jb .L_invalid_distance_too_far\n" /* if (dist > wsize) */ - -" addl %%ebp, %%ecx\n" /* nbytes = dist - nbytes */ -" cmpl $0, 52(%%esp)\n" -" jne .L_wrap_around_window\n" /* if (write != 0) */ - -" subl %%ecx, %%eax\n" -" addl %%eax, %%esi\n" /* from += wsize - nbytes */ - -" movl 64(%%esp), %%eax\n" /* eax = len */ -" cmpl %%ecx, %%eax\n" -" jbe .L_do_copy\n" /* if (nbytes >= len) */ - -" subl %%ecx, %%eax\n" /* len -= nbytes */ -" rep movsb\n" -" movl %%edi, %%esi\n" -" subl %%ebp, %%esi\n" /* from = out - dist */ -" jmp .L_do_copy\n" - -".align 32,0x90\n" -".L_wrap_around_window:\n" -" movl 52(%%esp), %%eax\n" /* eax = write */ -" cmpl %%eax, %%ecx\n" -" jbe .L_contiguous_in_window\n" /* if (write >= nbytes) */ - -" addl 48(%%esp), %%esi\n" /* from += wsize */ -" addl %%eax, %%esi\n" /* from += write */ -" subl %%ecx, %%esi\n" /* from -= nbytes */ -" subl %%eax, %%ecx\n" /* nbytes -= write */ - -" movl 64(%%esp), %%eax\n" /* eax = len */ -" cmpl %%ecx, %%eax\n" -" jbe .L_do_copy\n" /* if (nbytes >= len) */ - -" subl %%ecx, %%eax\n" /* len -= nbytes */ -" rep movsb\n" -" movl 28(%%esp), %%esi\n" /* from = window */ -" movl 52(%%esp), %%ecx\n" /* nbytes = write */ -" cmpl %%ecx, %%eax\n" -" jbe .L_do_copy\n" /* if (nbytes >= len) */ - -" subl %%ecx, %%eax\n" /* len -= nbytes */ -" rep movsb\n" -" movl %%edi, %%esi\n" -" subl %%ebp, %%esi\n" /* from = out - dist */ -" jmp .L_do_copy\n" - -".align 32,0x90\n" -".L_contiguous_in_window:\n" -" addl %%eax, %%esi\n" -" subl %%ecx, %%esi\n" /* from += write - nbytes */ - -" movl 64(%%esp), %%eax\n" /* eax = len */ -" cmpl %%ecx, %%eax\n" -" jbe .L_do_copy\n" /* if (nbytes >= len) */ - -" subl %%ecx, %%eax\n" /* len -= nbytes */ -" rep movsb\n" -" movl %%edi, %%esi\n" -" subl %%ebp, %%esi\n" /* from = out - dist */ -" jmp .L_do_copy\n" /* if (nbytes >= len) */ - -".align 32,0x90\n" -".L_do_copy:\n" -" movl %%eax, %%ecx\n" -" rep movsb\n" - -" movl 8(%%esp), %%esi\n" /* move in back to %esi, toss from */ -" movl 32(%%esp), %%ebp\n" /* ebp = lcode */ -" jmp .L_while_test\n" - -".L_test_for_end_of_block:\n" -" testb $32, %%al\n" -" jz .L_invalid_literal_length_code\n" -" movl $1, 72(%%esp)\n" -" jmp .L_break_loop_with_status\n" - -".L_invalid_literal_length_code:\n" -" movl $2, 72(%%esp)\n" -" jmp .L_break_loop_with_status\n" - -".L_invalid_distance_code:\n" -" movl $3, 72(%%esp)\n" -" jmp .L_break_loop_with_status\n" - -".L_invalid_distance_too_far:\n" -" movl 8(%%esp), %%esi\n" -" movl $4, 72(%%esp)\n" -" jmp .L_break_loop_with_status\n" - -".L_break_loop:\n" -" movl $0, 72(%%esp)\n" - -".L_break_loop_with_status:\n" -/* put in, out, bits, and hold back into ar and pop esp */ -" movl %%esi, 8(%%esp)\n" /* save in */ -" movl %%edi, 16(%%esp)\n" /* save out */ -" movl %%ebx, 44(%%esp)\n" /* save bits */ -" movl %%edx, 40(%%esp)\n" /* save hold */ -" movl 4(%%esp), %%ebp\n" /* restore esp, ebp */ -" movl (%%esp), %%esp\n" - : - : "m" (ar) - : "memory", "%eax", "%ebx", "%ecx", "%edx", "%esi", "%edi" - ); -#elif defined( _MSC_VER ) && ! defined( _M_AMD64 ) - __asm { - lea eax, ar - mov [eax], esp /* save esp, ebp */ - mov [eax+4], ebp - mov esp, eax - mov esi, [esp+8] /* esi = in */ - mov edi, [esp+16] /* edi = out */ - mov edx, [esp+40] /* edx = hold */ - mov ebx, [esp+44] /* ebx = bits */ - mov ebp, [esp+32] /* ebp = lcode */ - - cld - jmp L_do_loop - -ALIGN 4 -L_while_test: - cmp [esp+24], edi - jbe L_break_loop - cmp [esp+12], esi - jbe L_break_loop - -L_do_loop: - cmp bl, 15 - ja L_get_length_code /* if (15 < bits) */ - - xor eax, eax - lodsw /* al = *(ushort *)in++ */ - mov cl, bl /* cl = bits, needs it for shifting */ - add bl, 16 /* bits += 16 */ - shl eax, cl - or edx, eax /* hold |= *((ushort *)in)++ << bits */ - -L_get_length_code: - mov eax, [esp+56] /* eax = lmask */ - and eax, edx /* eax &= hold */ - mov eax, [ebp+eax*4] /* eax = lcode[hold & lmask] */ - -L_dolen: - mov cl, ah /* cl = this.bits */ - sub bl, ah /* bits -= this.bits */ - shr edx, cl /* hold >>= this.bits */ - - test al, al - jnz L_test_for_length_base /* if (op != 0) 45.7% */ - - shr eax, 16 /* output this.val char */ - stosb - jmp L_while_test - -ALIGN 4 -L_test_for_length_base: - mov ecx, eax /* len = this */ - shr ecx, 16 /* len = this.val */ - mov [esp+64], ecx /* save len */ - mov cl, al - - test al, 16 - jz L_test_for_second_level_length /* if ((op & 16) == 0) 8% */ - and cl, 15 /* op &= 15 */ - jz L_decode_distance /* if (!op) */ - cmp bl, cl - jae L_add_bits_to_len /* if (op <= bits) */ - - mov ch, cl /* stash op in ch, freeing cl */ - xor eax, eax - lodsw /* al = *(ushort *)in++ */ - mov cl, bl /* cl = bits, needs it for shifting */ - add bl, 16 /* bits += 16 */ - shl eax, cl - or edx, eax /* hold |= *((ushort *)in)++ << bits */ - mov cl, ch /* move op back to ecx */ - -L_add_bits_to_len: - sub bl, cl - xor eax, eax - inc eax - shl eax, cl - dec eax - and eax, edx /* eax &= hold */ - shr edx, cl - add [esp+64], eax /* len += hold & mask[op] */ - -L_decode_distance: - cmp bl, 15 - ja L_get_distance_code /* if (15 < bits) */ - - xor eax, eax - lodsw /* al = *(ushort *)in++ */ - mov cl, bl /* cl = bits, needs it for shifting */ - add bl, 16 /* bits += 16 */ - shl eax, cl - or edx, eax /* hold |= *((ushort *)in)++ << bits */ - -L_get_distance_code: - mov eax, [esp+60] /* eax = dmask */ - mov ecx, [esp+36] /* ecx = dcode */ - and eax, edx /* eax &= hold */ - mov eax, [ecx+eax*4]/* eax = dcode[hold & dmask] */ - -L_dodist: - mov ebp, eax /* dist = this */ - shr ebp, 16 /* dist = this.val */ - mov cl, ah - sub bl, ah /* bits -= this.bits */ - shr edx, cl /* hold >>= this.bits */ - mov cl, al /* cl = this.op */ - - test al, 16 /* if ((op & 16) == 0) */ - jz L_test_for_second_level_dist - and cl, 15 /* op &= 15 */ - jz L_check_dist_one - cmp bl, cl - jae L_add_bits_to_dist /* if (op <= bits) 97.6% */ - - mov ch, cl /* stash op in ch, freeing cl */ - xor eax, eax - lodsw /* al = *(ushort *)in++ */ - mov cl, bl /* cl = bits, needs it for shifting */ - add bl, 16 /* bits += 16 */ - shl eax, cl - or edx, eax /* hold |= *((ushort *)in)++ << bits */ - mov cl, ch /* move op back to ecx */ - -L_add_bits_to_dist: - sub bl, cl - xor eax, eax - inc eax - shl eax, cl - dec eax /* (1 << op) - 1 */ - and eax, edx /* eax &= hold */ - shr edx, cl - add ebp, eax /* dist += hold & ((1 << op) - 1) */ - -L_check_window: - mov [esp+8], esi /* save in so from can use it's reg */ - mov eax, edi - sub eax, [esp+20] /* nbytes = out - beg */ - - cmp eax, ebp - jb L_clip_window /* if (dist > nbytes) 4.2% */ - - mov ecx, [esp+64] /* ecx = len */ - mov esi, edi - sub esi, ebp /* from = out - dist */ - - sar ecx, 1 - jnc L_copy_two - - rep movsw - mov al, [esi] - mov [edi], al - inc edi - - mov esi, [esp+8] /* move in back to %esi, toss from */ - mov ebp, [esp+32] /* ebp = lcode */ - jmp L_while_test - -L_copy_two: - rep movsw - mov esi, [esp+8] /* move in back to %esi, toss from */ - mov ebp, [esp+32] /* ebp = lcode */ - jmp L_while_test - -ALIGN 4 -L_check_dist_one: - cmp ebp, 1 /* if dist 1, is a memset */ - jne L_check_window - cmp [esp+20], edi - je L_check_window /* out == beg, if outside window */ - - mov ecx, [esp+64] /* ecx = len */ - mov al, [edi-1] - mov ah, al - - sar ecx, 1 - jnc L_set_two - mov [edi], al /* memset out with from[-1] */ - inc edi - -L_set_two: - rep stosw - mov ebp, [esp+32] /* ebp = lcode */ - jmp L_while_test - -ALIGN 4 -L_test_for_second_level_length: - test al, 64 - jnz L_test_for_end_of_block /* if ((op & 64) != 0) */ - - xor eax, eax - inc eax - shl eax, cl - dec eax - and eax, edx /* eax &= hold */ - add eax, [esp+64] /* eax += len */ - mov eax, [ebp+eax*4] /* eax = lcode[val+(hold&mask[op])]*/ - jmp L_dolen - -ALIGN 4 -L_test_for_second_level_dist: - test al, 64 - jnz L_invalid_distance_code /* if ((op & 64) != 0) */ - - xor eax, eax - inc eax - shl eax, cl - dec eax - and eax, edx /* eax &= hold */ - add eax, ebp /* eax += dist */ - mov ecx, [esp+36] /* ecx = dcode */ - mov eax, [ecx+eax*4] /* eax = dcode[val+(hold&mask[op])]*/ - jmp L_dodist - -ALIGN 4 -L_clip_window: - mov ecx, eax - mov eax, [esp+48] /* eax = wsize */ - neg ecx /* nbytes = -nbytes */ - mov esi, [esp+28] /* from = window */ - - cmp eax, ebp - jb L_invalid_distance_too_far /* if (dist > wsize) */ - - add ecx, ebp /* nbytes = dist - nbytes */ - cmp dword ptr [esp+52], 0 - jne L_wrap_around_window /* if (write != 0) */ - - sub eax, ecx - add esi, eax /* from += wsize - nbytes */ - - mov eax, [esp+64] /* eax = len */ - cmp eax, ecx - jbe L_do_copy /* if (nbytes >= len) */ - - sub eax, ecx /* len -= nbytes */ - rep movsb - mov esi, edi - sub esi, ebp /* from = out - dist */ - jmp L_do_copy - -ALIGN 4 -L_wrap_around_window: - mov eax, [esp+52] /* eax = write */ - cmp ecx, eax - jbe L_contiguous_in_window /* if (write >= nbytes) */ - - add esi, [esp+48] /* from += wsize */ - add esi, eax /* from += write */ - sub esi, ecx /* from -= nbytes */ - sub ecx, eax /* nbytes -= write */ - - mov eax, [esp+64] /* eax = len */ - cmp eax, ecx - jbe L_do_copy /* if (nbytes >= len) */ - - sub eax, ecx /* len -= nbytes */ - rep movsb - mov esi, [esp+28] /* from = window */ - mov ecx, [esp+52] /* nbytes = write */ - cmp eax, ecx - jbe L_do_copy /* if (nbytes >= len) */ - - sub eax, ecx /* len -= nbytes */ - rep movsb - mov esi, edi - sub esi, ebp /* from = out - dist */ - jmp L_do_copy - -ALIGN 4 -L_contiguous_in_window: - add esi, eax - sub esi, ecx /* from += write - nbytes */ - - mov eax, [esp+64] /* eax = len */ - cmp eax, ecx - jbe L_do_copy /* if (nbytes >= len) */ - - sub eax, ecx /* len -= nbytes */ - rep movsb - mov esi, edi - sub esi, ebp /* from = out - dist */ - jmp L_do_copy - -ALIGN 4 -L_do_copy: - mov ecx, eax - rep movsb - - mov esi, [esp+8] /* move in back to %esi, toss from */ - mov ebp, [esp+32] /* ebp = lcode */ - jmp L_while_test - -L_test_for_end_of_block: - test al, 32 - jz L_invalid_literal_length_code - mov dword ptr [esp+72], 1 - jmp L_break_loop_with_status - -L_invalid_literal_length_code: - mov dword ptr [esp+72], 2 - jmp L_break_loop_with_status - -L_invalid_distance_code: - mov dword ptr [esp+72], 3 - jmp L_break_loop_with_status - -L_invalid_distance_too_far: - mov esi, [esp+4] - mov dword ptr [esp+72], 4 - jmp L_break_loop_with_status - -L_break_loop: - mov dword ptr [esp+72], 0 - -L_break_loop_with_status: -/* put in, out, bits, and hold back into ar and pop esp */ - mov [esp+8], esi /* save in */ - mov [esp+16], edi /* save out */ - mov [esp+44], ebx /* save bits */ - mov [esp+40], edx /* save hold */ - mov ebp, [esp+4] /* restore esp, ebp */ - mov esp, [esp] - } -#else -#error "x86 architecture not defined" -#endif - - if (ar.status > 1) { - if (ar.status == 2) - strm->msg = "invalid literal/length code"; - else if (ar.status == 3) - strm->msg = "invalid distance code"; - else - strm->msg = "invalid distance too far back"; - state->mode = BAD; - } - else if ( ar.status == 1 ) { - state->mode = TYPE; - } - - /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ - ar.len = ar.bits >> 3; - ar.in -= ar.len; - ar.bits -= ar.len << 3; - ar.hold &= (1U << ar.bits) - 1; - - /* update state and return */ - strm->next_in = ar.in; - strm->next_out = ar.out; - strm->avail_in = (unsigned)(ar.in < ar.last ? - PAD_AVAIL_IN + (ar.last - ar.in) : - PAD_AVAIL_IN - (ar.in - ar.last)); - strm->avail_out = (unsigned)(ar.out < ar.end ? - PAD_AVAIL_OUT + (ar.end - ar.out) : - PAD_AVAIL_OUT - (ar.out - ar.end)); - state->hold = ar.hold; - state->bits = ar.bits; - return; -} - diff --git a/contrib/inflate86/inffast.S b/contrib/inflate86/inffast.S deleted file mode 100644 index 2245a29..0000000 --- a/contrib/inflate86/inffast.S +++ /dev/null @@ -1,1368 +0,0 @@ -/* - * inffast.S is a hand tuned assembler version of: - * - * inffast.c -- fast decoding - * Copyright (C) 1995-2003 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - * - * Copyright (C) 2003 Chris Anderson - * Please use the copyright conditions above. - * - * This version (Jan-23-2003) of inflate_fast was coded and tested under - * GNU/Linux on a pentium 3, using the gcc-3.2 compiler distribution. On that - * machine, I found that gzip style archives decompressed about 20% faster than - * the gcc-3.2 -O3 -fomit-frame-pointer compiled version. Your results will - * depend on how large of a buffer is used for z_stream.next_in & next_out - * (8K-32K worked best for my 256K cpu cache) and how much overhead there is in - * stream processing I/O and crc32/addler32. In my case, this routine used - * 70% of the cpu time and crc32 used 20%. - * - * I am confident that this version will work in the general case, but I have - * not tested a wide variety of datasets or a wide variety of platforms. - * - * Jan-24-2003 -- Added -DUSE_MMX define for slightly faster inflating. - * It should be a runtime flag instead of compile time flag... - * - * Jan-26-2003 -- Added runtime check for MMX support with cpuid instruction. - * With -DUSE_MMX, only MMX code is compiled. With -DNO_MMX, only non-MMX code - * is compiled. Without either option, runtime detection is enabled. Runtime - * detection should work on all modern cpus and the recomended algorithm (flip - * ID bit on eflags and then use the cpuid instruction) is used in many - * multimedia applications. Tested under win2k with gcc-2.95 and gas-2.12 - * distributed with cygwin3. Compiling with gcc-2.95 -c inffast.S -o - * inffast.obj generates a COFF object which can then be linked with MSVC++ - * compiled code. Tested under FreeBSD 4.7 with gcc-2.95. - * - * Jan-28-2003 -- Tested Athlon XP... MMX mode is slower than no MMX (and - * slower than compiler generated code). Adjusted cpuid check to use the MMX - * code only for Pentiums < P4 until I have more data on the P4. Speed - * improvment is only about 15% on the Athlon when compared with code generated - * with MSVC++. Not sure yet, but I think the P4 will also be slower using the - * MMX mode because many of it's x86 ALU instructions execute in .5 cycles and - * have less latency than MMX ops. Added code to buffer the last 11 bytes of - * the input stream since the MMX code grabs bits in chunks of 32, which - * differs from the inffast.c algorithm. I don't think there would have been - * read overruns where a page boundary was crossed (a segfault), but there - * could have been overruns when next_in ends on unaligned memory (unintialized - * memory read). - * - * Mar-13-2003 -- P4 MMX is slightly slower than P4 NO_MMX. I created a C - * version of the non-MMX code so that it doesn't depend on zstrm and zstate - * structure offsets which are hard coded in this file. This was last tested - * with zlib-1.2.0 which is currently in beta testing, newer versions of this - * and inffas86.c can be found at http://www.eetbeetee.com/zlib/ and - * http://www.charm.net/~christop/zlib/ - */ - - -/* - * if you have underscore linking problems (_inflate_fast undefined), try - * using -DGAS_COFF - */ -#if ! defined( GAS_COFF ) && ! defined( GAS_ELF ) - -#if defined( WIN32 ) || defined( __CYGWIN__ ) -#define GAS_COFF /* windows object format */ -#else -#define GAS_ELF -#endif - -#endif /* ! GAS_COFF && ! GAS_ELF */ - - -#if defined( GAS_COFF ) - -/* coff externals have underscores */ -#define inflate_fast _inflate_fast -#define inflate_fast_use_mmx _inflate_fast_use_mmx - -#endif /* GAS_COFF */ - - -.file "inffast.S" - -.globl inflate_fast - -.text -.align 4,0 -.L_invalid_literal_length_code_msg: -.string "invalid literal/length code" - -.align 4,0 -.L_invalid_distance_code_msg: -.string "invalid distance code" - -.align 4,0 -.L_invalid_distance_too_far_msg: -.string "invalid distance too far back" - -#if ! defined( NO_MMX ) -.align 4,0 -.L_mask: /* mask[N] = ( 1 << N ) - 1 */ -.long 0 -.long 1 -.long 3 -.long 7 -.long 15 -.long 31 -.long 63 -.long 127 -.long 255 -.long 511 -.long 1023 -.long 2047 -.long 4095 -.long 8191 -.long 16383 -.long 32767 -.long 65535 -.long 131071 -.long 262143 -.long 524287 -.long 1048575 -.long 2097151 -.long 4194303 -.long 8388607 -.long 16777215 -.long 33554431 -.long 67108863 -.long 134217727 -.long 268435455 -.long 536870911 -.long 1073741823 -.long 2147483647 -.long 4294967295 -#endif /* NO_MMX */ - -.text - -/* - * struct z_stream offsets, in zlib.h - */ -#define next_in_strm 0 /* strm->next_in */ -#define avail_in_strm 4 /* strm->avail_in */ -#define next_out_strm 12 /* strm->next_out */ -#define avail_out_strm 16 /* strm->avail_out */ -#define msg_strm 24 /* strm->msg */ -#define state_strm 28 /* strm->state */ - -/* - * struct inflate_state offsets, in inflate.h - */ -#define mode_state 0 /* state->mode */ -#define wsize_state 32 /* state->wsize */ -#define write_state 40 /* state->write */ -#define window_state 44 /* state->window */ -#define hold_state 48 /* state->hold */ -#define bits_state 52 /* state->bits */ -#define lencode_state 68 /* state->lencode */ -#define distcode_state 72 /* state->distcode */ -#define lenbits_state 76 /* state->lenbits */ -#define distbits_state 80 /* state->distbits */ - -/* - * inflate_fast's activation record - */ -#define local_var_size 64 /* how much local space for vars */ -#define strm_sp 88 /* first arg: z_stream * (local_var_size + 24) */ -#define start_sp 92 /* second arg: unsigned int (local_var_size + 28) */ - -/* - * offsets for local vars on stack - */ -#define out 60 /* unsigned char* */ -#define window 56 /* unsigned char* */ -#define wsize 52 /* unsigned int */ -#define write 48 /* unsigned int */ -#define in 44 /* unsigned char* */ -#define beg 40 /* unsigned char* */ -#define buf 28 /* char[ 12 ] */ -#define len 24 /* unsigned int */ -#define last 20 /* unsigned char* */ -#define end 16 /* unsigned char* */ -#define dcode 12 /* code* */ -#define lcode 8 /* code* */ -#define dmask 4 /* unsigned int */ -#define lmask 0 /* unsigned int */ - -/* - * typedef enum inflate_mode consts, in inflate.h - */ -#define INFLATE_MODE_TYPE 11 /* state->mode flags enum-ed in inflate.h */ -#define INFLATE_MODE_BAD 26 - - -#if ! defined( USE_MMX ) && ! defined( NO_MMX ) - -#define RUN_TIME_MMX - -#define CHECK_MMX 1 -#define DO_USE_MMX 2 -#define DONT_USE_MMX 3 - -.globl inflate_fast_use_mmx - -.data - -.align 4,0 -inflate_fast_use_mmx: /* integer flag for run time control 1=check,2=mmx,3=no */ -.long CHECK_MMX - -#if defined( GAS_ELF ) -/* elf info */ -.type inflate_fast_use_mmx,@object -.size inflate_fast_use_mmx,4 -#endif - -#endif /* RUN_TIME_MMX */ - -#if defined( GAS_COFF ) -/* coff info: scl 2 = extern, type 32 = function */ -.def inflate_fast; .scl 2; .type 32; .endef -#endif - -.text - -.align 32,0x90 -inflate_fast: - pushl %edi - pushl %esi - pushl %ebp - pushl %ebx - pushf /* save eflags (strm_sp, state_sp assumes this is 32 bits) */ - subl $local_var_size, %esp - cld - -#define strm_r %esi -#define state_r %edi - - movl strm_sp(%esp), strm_r - movl state_strm(strm_r), state_r - - /* in = strm->next_in; - * out = strm->next_out; - * last = in + strm->avail_in - 11; - * beg = out - (start - strm->avail_out); - * end = out + (strm->avail_out - 257); - */ - movl avail_in_strm(strm_r), %edx - movl next_in_strm(strm_r), %eax - - addl %eax, %edx /* avail_in += next_in */ - subl $11, %edx /* avail_in -= 11 */ - - movl %eax, in(%esp) - movl %edx, last(%esp) - - movl start_sp(%esp), %ebp - movl avail_out_strm(strm_r), %ecx - movl next_out_strm(strm_r), %ebx - - subl %ecx, %ebp /* start -= avail_out */ - negl %ebp /* start = -start */ - addl %ebx, %ebp /* start += next_out */ - - subl $257, %ecx /* avail_out -= 257 */ - addl %ebx, %ecx /* avail_out += out */ - - movl %ebx, out(%esp) - movl %ebp, beg(%esp) - movl %ecx, end(%esp) - - /* wsize = state->wsize; - * write = state->write; - * window = state->window; - * hold = state->hold; - * bits = state->bits; - * lcode = state->lencode; - * dcode = state->distcode; - * lmask = ( 1 << state->lenbits ) - 1; - * dmask = ( 1 << state->distbits ) - 1; - */ - - movl lencode_state(state_r), %eax - movl distcode_state(state_r), %ecx - - movl %eax, lcode(%esp) - movl %ecx, dcode(%esp) - - movl $1, %eax - movl lenbits_state(state_r), %ecx - shll %cl, %eax - decl %eax - movl %eax, lmask(%esp) - - movl $1, %eax - movl distbits_state(state_r), %ecx - shll %cl, %eax - decl %eax - movl %eax, dmask(%esp) - - movl wsize_state(state_r), %eax - movl write_state(state_r), %ecx - movl window_state(state_r), %edx - - movl %eax, wsize(%esp) - movl %ecx, write(%esp) - movl %edx, window(%esp) - - movl hold_state(state_r), %ebp - movl bits_state(state_r), %ebx - -#undef strm_r -#undef state_r - -#define in_r %esi -#define from_r %esi -#define out_r %edi - - movl in(%esp), in_r - movl last(%esp), %ecx - cmpl in_r, %ecx - ja .L_align_long /* if in < last */ - - addl $11, %ecx /* ecx = &in[ avail_in ] */ - subl in_r, %ecx /* ecx = avail_in */ - movl $12, %eax - subl %ecx, %eax /* eax = 12 - avail_in */ - leal buf(%esp), %edi - rep movsb /* memcpy( buf, in, avail_in ) */ - movl %eax, %ecx - xorl %eax, %eax - rep stosb /* memset( &buf[ avail_in ], 0, 12 - avail_in ) */ - leal buf(%esp), in_r /* in = buf */ - movl in_r, last(%esp) /* last = in, do just one iteration */ - jmp .L_is_aligned - - /* align in_r on long boundary */ -.L_align_long: - testl $3, in_r - jz .L_is_aligned - xorl %eax, %eax - movb (in_r), %al - incl in_r - movl %ebx, %ecx - addl $8, %ebx - shll %cl, %eax - orl %eax, %ebp - jmp .L_align_long - -.L_is_aligned: - movl out(%esp), out_r - -#if defined( NO_MMX ) - jmp .L_do_loop -#endif - -#if defined( USE_MMX ) - jmp .L_init_mmx -#endif - -/*** Runtime MMX check ***/ - -#if defined( RUN_TIME_MMX ) -.L_check_mmx: - cmpl $DO_USE_MMX, inflate_fast_use_mmx - je .L_init_mmx - ja .L_do_loop /* > 2 */ - - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - pushf - movl (%esp), %eax /* copy eflags to eax */ - xorl $0x200000, (%esp) /* try toggling ID bit of eflags (bit 21) - * to see if cpu supports cpuid... - * ID bit method not supported by NexGen but - * bios may load a cpuid instruction and - * cpuid may be disabled on Cyrix 5-6x86 */ - popf - pushf - popl %edx /* copy new eflags to edx */ - xorl %eax, %edx /* test if ID bit is flipped */ - jz .L_dont_use_mmx /* not flipped if zero */ - xorl %eax, %eax - cpuid - cmpl $0x756e6547, %ebx /* check for GenuineIntel in ebx,ecx,edx */ - jne .L_dont_use_mmx - cmpl $0x6c65746e, %ecx - jne .L_dont_use_mmx - cmpl $0x49656e69, %edx - jne .L_dont_use_mmx - movl $1, %eax - cpuid /* get cpu features */ - shrl $8, %eax - andl $15, %eax - cmpl $6, %eax /* check for Pentium family, is 0xf for P4 */ - jne .L_dont_use_mmx - testl $0x800000, %edx /* test if MMX feature is set (bit 23) */ - jnz .L_use_mmx - jmp .L_dont_use_mmx -.L_use_mmx: - movl $DO_USE_MMX, inflate_fast_use_mmx - jmp .L_check_mmx_pop -.L_dont_use_mmx: - movl $DONT_USE_MMX, inflate_fast_use_mmx -.L_check_mmx_pop: - popl %edx - popl %ecx - popl %ebx - popl %eax - jmp .L_check_mmx -#endif - - -/*** Non-MMX code ***/ - -#if defined ( NO_MMX ) || defined( RUN_TIME_MMX ) - -#define hold_r %ebp -#define bits_r %bl -#define bitslong_r %ebx - -.align 32,0x90 -.L_while_test: - /* while (in < last && out < end) - */ - cmpl out_r, end(%esp) - jbe .L_break_loop /* if (out >= end) */ - - cmpl in_r, last(%esp) - jbe .L_break_loop - -.L_do_loop: - /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out - * - * do { - * if (bits < 15) { - * hold |= *((unsigned short *)in)++ << bits; - * bits += 16 - * } - * this = lcode[hold & lmask] - */ - cmpb $15, bits_r - ja .L_get_length_code /* if (15 < bits) */ - - xorl %eax, %eax - lodsw /* al = *(ushort *)in++ */ - movb bits_r, %cl /* cl = bits, needs it for shifting */ - addb $16, bits_r /* bits += 16 */ - shll %cl, %eax - orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */ - -.L_get_length_code: - movl lmask(%esp), %edx /* edx = lmask */ - movl lcode(%esp), %ecx /* ecx = lcode */ - andl hold_r, %edx /* edx &= hold */ - movl (%ecx,%edx,4), %eax /* eax = lcode[hold & lmask] */ - -.L_dolen: - /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out - * - * dolen: - * bits -= this.bits; - * hold >>= this.bits - */ - movb %ah, %cl /* cl = this.bits */ - subb %ah, bits_r /* bits -= this.bits */ - shrl %cl, hold_r /* hold >>= this.bits */ - - /* check if op is a literal - * if (op == 0) { - * PUP(out) = this.val; - * } - */ - testb %al, %al - jnz .L_test_for_length_base /* if (op != 0) 45.7% */ - - shrl $16, %eax /* output this.val char */ - stosb - jmp .L_while_test - -.L_test_for_length_base: - /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = len - * - * else if (op & 16) { - * len = this.val - * op &= 15 - * if (op) { - * if (op > bits) { - * hold |= *((unsigned short *)in)++ << bits; - * bits += 16 - * } - * len += hold & mask[op]; - * bits -= op; - * hold >>= op; - * } - */ -#define len_r %edx - movl %eax, len_r /* len = this */ - shrl $16, len_r /* len = this.val */ - movb %al, %cl - - testb $16, %al - jz .L_test_for_second_level_length /* if ((op & 16) == 0) 8% */ - andb $15, %cl /* op &= 15 */ - jz .L_save_len /* if (!op) */ - cmpb %cl, bits_r - jae .L_add_bits_to_len /* if (op <= bits) */ - - movb %cl, %ch /* stash op in ch, freeing cl */ - xorl %eax, %eax - lodsw /* al = *(ushort *)in++ */ - movb bits_r, %cl /* cl = bits, needs it for shifting */ - addb $16, bits_r /* bits += 16 */ - shll %cl, %eax - orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */ - movb %ch, %cl /* move op back to ecx */ - -.L_add_bits_to_len: - movl $1, %eax - shll %cl, %eax - decl %eax - subb %cl, bits_r - andl hold_r, %eax /* eax &= hold */ - shrl %cl, hold_r - addl %eax, len_r /* len += hold & mask[op] */ - -.L_save_len: - movl len_r, len(%esp) /* save len */ -#undef len_r - -.L_decode_distance: - /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = dist - * - * if (bits < 15) { - * hold |= *((unsigned short *)in)++ << bits; - * bits += 16 - * } - * this = dcode[hold & dmask]; - * dodist: - * bits -= this.bits; - * hold >>= this.bits; - * op = this.op; - */ - - cmpb $15, bits_r - ja .L_get_distance_code /* if (15 < bits) */ - - xorl %eax, %eax - lodsw /* al = *(ushort *)in++ */ - movb bits_r, %cl /* cl = bits, needs it for shifting */ - addb $16, bits_r /* bits += 16 */ - shll %cl, %eax - orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */ - -.L_get_distance_code: - movl dmask(%esp), %edx /* edx = dmask */ - movl dcode(%esp), %ecx /* ecx = dcode */ - andl hold_r, %edx /* edx &= hold */ - movl (%ecx,%edx,4), %eax /* eax = dcode[hold & dmask] */ - -#define dist_r %edx -.L_dodist: - movl %eax, dist_r /* dist = this */ - shrl $16, dist_r /* dist = this.val */ - movb %ah, %cl - subb %ah, bits_r /* bits -= this.bits */ - shrl %cl, hold_r /* hold >>= this.bits */ - - /* if (op & 16) { - * dist = this.val - * op &= 15 - * if (op > bits) { - * hold |= *((unsigned short *)in)++ << bits; - * bits += 16 - * } - * dist += hold & mask[op]; - * bits -= op; - * hold >>= op; - */ - movb %al, %cl /* cl = this.op */ - - testb $16, %al /* if ((op & 16) == 0) */ - jz .L_test_for_second_level_dist - andb $15, %cl /* op &= 15 */ - jz .L_check_dist_one - cmpb %cl, bits_r - jae .L_add_bits_to_dist /* if (op <= bits) 97.6% */ - - movb %cl, %ch /* stash op in ch, freeing cl */ - xorl %eax, %eax - lodsw /* al = *(ushort *)in++ */ - movb bits_r, %cl /* cl = bits, needs it for shifting */ - addb $16, bits_r /* bits += 16 */ - shll %cl, %eax - orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */ - movb %ch, %cl /* move op back to ecx */ - -.L_add_bits_to_dist: - movl $1, %eax - shll %cl, %eax - decl %eax /* (1 << op) - 1 */ - subb %cl, bits_r - andl hold_r, %eax /* eax &= hold */ - shrl %cl, hold_r - addl %eax, dist_r /* dist += hold & ((1 << op) - 1) */ - jmp .L_check_window - -.L_check_window: - /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist - * %ecx = nbytes - * - * nbytes = out - beg; - * if (dist <= nbytes) { - * from = out - dist; - * do { - * PUP(out) = PUP(from); - * } while (--len > 0) { - * } - */ - - movl in_r, in(%esp) /* save in so from can use it's reg */ - movl out_r, %eax - subl beg(%esp), %eax /* nbytes = out - beg */ - - cmpl dist_r, %eax - jb .L_clip_window /* if (dist > nbytes) 4.2% */ - - movl len(%esp), %ecx - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - - subl $3, %ecx - movb (from_r), %al - movb %al, (out_r) - movb 1(from_r), %al - movb 2(from_r), %dl - addl $3, from_r - movb %al, 1(out_r) - movb %dl, 2(out_r) - addl $3, out_r - rep movsb - - movl in(%esp), in_r /* move in back to %esi, toss from */ - jmp .L_while_test - -.align 16,0x90 -.L_check_dist_one: - cmpl $1, dist_r - jne .L_check_window - cmpl out_r, beg(%esp) - je .L_check_window - - decl out_r - movl len(%esp), %ecx - movb (out_r), %al - subl $3, %ecx - - movb %al, 1(out_r) - movb %al, 2(out_r) - movb %al, 3(out_r) - addl $4, out_r - rep stosb - - jmp .L_while_test - -.align 16,0x90 -.L_test_for_second_level_length: - /* else if ((op & 64) == 0) { - * this = lcode[this.val + (hold & mask[op])]; - * } - */ - testb $64, %al - jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */ - - movl $1, %eax - shll %cl, %eax - decl %eax - andl hold_r, %eax /* eax &= hold */ - addl %edx, %eax /* eax += this.val */ - movl lcode(%esp), %edx /* edx = lcode */ - movl (%edx,%eax,4), %eax /* eax = lcode[val + (hold&mask[op])] */ - jmp .L_dolen - -.align 16,0x90 -.L_test_for_second_level_dist: - /* else if ((op & 64) == 0) { - * this = dcode[this.val + (hold & mask[op])]; - * } - */ - testb $64, %al - jnz .L_invalid_distance_code /* if ((op & 64) != 0) */ - - movl $1, %eax - shll %cl, %eax - decl %eax - andl hold_r, %eax /* eax &= hold */ - addl %edx, %eax /* eax += this.val */ - movl dcode(%esp), %edx /* edx = dcode */ - movl (%edx,%eax,4), %eax /* eax = dcode[val + (hold&mask[op])] */ - jmp .L_dodist - -.align 16,0x90 -.L_clip_window: - /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist - * %ecx = nbytes - * - * else { - * if (dist > wsize) { - * invalid distance - * } - * from = window; - * nbytes = dist - nbytes; - * if (write == 0) { - * from += wsize - nbytes; - */ -#define nbytes_r %ecx - movl %eax, nbytes_r - movl wsize(%esp), %eax /* prepare for dist compare */ - negl nbytes_r /* nbytes = -nbytes */ - movl window(%esp), from_r /* from = window */ - - cmpl dist_r, %eax - jb .L_invalid_distance_too_far /* if (dist > wsize) */ - - addl dist_r, nbytes_r /* nbytes = dist - nbytes */ - cmpl $0, write(%esp) - jne .L_wrap_around_window /* if (write != 0) */ - - subl nbytes_r, %eax - addl %eax, from_r /* from += wsize - nbytes */ - - /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist - * %ecx = nbytes, %eax = len - * - * if (nbytes < len) { - * len -= nbytes; - * do { - * PUP(out) = PUP(from); - * } while (--nbytes); - * from = out - dist; - * } - * } - */ -#define len_r %eax - movl len(%esp), len_r - cmpl nbytes_r, len_r - jbe .L_do_copy1 /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - jmp .L_do_copy1 - - cmpl nbytes_r, len_r - jbe .L_do_copy1 /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - jmp .L_do_copy1 - -.L_wrap_around_window: - /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist - * %ecx = nbytes, %eax = write, %eax = len - * - * else if (write < nbytes) { - * from += wsize + write - nbytes; - * nbytes -= write; - * if (nbytes < len) { - * len -= nbytes; - * do { - * PUP(out) = PUP(from); - * } while (--nbytes); - * from = window; - * nbytes = write; - * if (nbytes < len) { - * len -= nbytes; - * do { - * PUP(out) = PUP(from); - * } while(--nbytes); - * from = out - dist; - * } - * } - * } - */ -#define write_r %eax - movl write(%esp), write_r - cmpl write_r, nbytes_r - jbe .L_contiguous_in_window /* if (write >= nbytes) */ - - addl wsize(%esp), from_r - addl write_r, from_r - subl nbytes_r, from_r /* from += wsize + write - nbytes */ - subl write_r, nbytes_r /* nbytes -= write */ -#undef write_r - - movl len(%esp), len_r - cmpl nbytes_r, len_r - jbe .L_do_copy1 /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl window(%esp), from_r /* from = window */ - movl write(%esp), nbytes_r /* nbytes = write */ - cmpl nbytes_r, len_r - jbe .L_do_copy1 /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - jmp .L_do_copy1 - -.L_contiguous_in_window: - /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist - * %ecx = nbytes, %eax = write, %eax = len - * - * else { - * from += write - nbytes; - * if (nbytes < len) { - * len -= nbytes; - * do { - * PUP(out) = PUP(from); - * } while (--nbytes); - * from = out - dist; - * } - * } - */ -#define write_r %eax - addl write_r, from_r - subl nbytes_r, from_r /* from += write - nbytes */ -#undef write_r - - movl len(%esp), len_r - cmpl nbytes_r, len_r - jbe .L_do_copy1 /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - -.L_do_copy1: - /* regs: %esi = from, %esi = in, %ebp = hold, %bl = bits, %edi = out - * %eax = len - * - * while (len > 0) { - * PUP(out) = PUP(from); - * len--; - * } - * } - * } while (in < last && out < end); - */ -#undef nbytes_r -#define in_r %esi - movl len_r, %ecx - rep movsb - - movl in(%esp), in_r /* move in back to %esi, toss from */ - jmp .L_while_test - -#undef len_r -#undef dist_r - -#endif /* NO_MMX || RUN_TIME_MMX */ - - -/*** MMX code ***/ - -#if defined( USE_MMX ) || defined( RUN_TIME_MMX ) - -.align 32,0x90 -.L_init_mmx: - emms - -#undef bits_r -#undef bitslong_r -#define bitslong_r %ebp -#define hold_mm %mm0 - movd %ebp, hold_mm - movl %ebx, bitslong_r - -#define used_mm %mm1 -#define dmask2_mm %mm2 -#define lmask2_mm %mm3 -#define lmask_mm %mm4 -#define dmask_mm %mm5 -#define tmp_mm %mm6 - - movd lmask(%esp), lmask_mm - movq lmask_mm, lmask2_mm - movd dmask(%esp), dmask_mm - movq dmask_mm, dmask2_mm - pxor used_mm, used_mm - movl lcode(%esp), %ebx /* ebx = lcode */ - jmp .L_do_loop_mmx - -.align 32,0x90 -.L_while_test_mmx: - /* while (in < last && out < end) - */ - cmpl out_r, end(%esp) - jbe .L_break_loop /* if (out >= end) */ - - cmpl in_r, last(%esp) - jbe .L_break_loop - -.L_do_loop_mmx: - psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ - - cmpl $32, bitslong_r - ja .L_get_length_code_mmx /* if (32 < bits) */ - - movd bitslong_r, tmp_mm - movd (in_r), %mm7 - addl $4, in_r - psllq tmp_mm, %mm7 - addl $32, bitslong_r - por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */ - -.L_get_length_code_mmx: - pand hold_mm, lmask_mm - movd lmask_mm, %eax - movq lmask2_mm, lmask_mm - movl (%ebx,%eax,4), %eax /* eax = lcode[hold & lmask] */ - -.L_dolen_mmx: - movzbl %ah, %ecx /* ecx = this.bits */ - movd %ecx, used_mm - subl %ecx, bitslong_r /* bits -= this.bits */ - - testb %al, %al - jnz .L_test_for_length_base_mmx /* if (op != 0) 45.7% */ - - shrl $16, %eax /* output this.val char */ - stosb - jmp .L_while_test_mmx - -.L_test_for_length_base_mmx: -#define len_r %edx - movl %eax, len_r /* len = this */ - shrl $16, len_r /* len = this.val */ - - testb $16, %al - jz .L_test_for_second_level_length_mmx /* if ((op & 16) == 0) 8% */ - andl $15, %eax /* op &= 15 */ - jz .L_decode_distance_mmx /* if (!op) */ - - psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ - movd %eax, used_mm - movd hold_mm, %ecx - subl %eax, bitslong_r - andl .L_mask(,%eax,4), %ecx - addl %ecx, len_r /* len += hold & mask[op] */ - -.L_decode_distance_mmx: - psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ - - cmpl $32, bitslong_r - ja .L_get_dist_code_mmx /* if (32 < bits) */ - - movd bitslong_r, tmp_mm - movd (in_r), %mm7 - addl $4, in_r - psllq tmp_mm, %mm7 - addl $32, bitslong_r - por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */ - -.L_get_dist_code_mmx: - movl dcode(%esp), %ebx /* ebx = dcode */ - pand hold_mm, dmask_mm - movd dmask_mm, %eax - movq dmask2_mm, dmask_mm - movl (%ebx,%eax,4), %eax /* eax = dcode[hold & lmask] */ - -.L_dodist_mmx: -#define dist_r %ebx - movzbl %ah, %ecx /* ecx = this.bits */ - movl %eax, dist_r - shrl $16, dist_r /* dist = this.val */ - subl %ecx, bitslong_r /* bits -= this.bits */ - movd %ecx, used_mm - - testb $16, %al /* if ((op & 16) == 0) */ - jz .L_test_for_second_level_dist_mmx - andl $15, %eax /* op &= 15 */ - jz .L_check_dist_one_mmx - -.L_add_bits_to_dist_mmx: - psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ - movd %eax, used_mm /* save bit length of current op */ - movd hold_mm, %ecx /* get the next bits on input stream */ - subl %eax, bitslong_r /* bits -= op bits */ - andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */ - addl %ecx, dist_r /* dist += hold & mask[op] */ - -.L_check_window_mmx: - movl in_r, in(%esp) /* save in so from can use it's reg */ - movl out_r, %eax - subl beg(%esp), %eax /* nbytes = out - beg */ - - cmpl dist_r, %eax - jb .L_clip_window_mmx /* if (dist > nbytes) 4.2% */ - - movl len_r, %ecx - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - - subl $3, %ecx - movb (from_r), %al - movb %al, (out_r) - movb 1(from_r), %al - movb 2(from_r), %dl - addl $3, from_r - movb %al, 1(out_r) - movb %dl, 2(out_r) - addl $3, out_r - rep movsb - - movl in(%esp), in_r /* move in back to %esi, toss from */ - movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */ - jmp .L_while_test_mmx - -.align 16,0x90 -.L_check_dist_one_mmx: - cmpl $1, dist_r - jne .L_check_window_mmx - cmpl out_r, beg(%esp) - je .L_check_window_mmx - - decl out_r - movl len_r, %ecx - movb (out_r), %al - subl $3, %ecx - - movb %al, 1(out_r) - movb %al, 2(out_r) - movb %al, 3(out_r) - addl $4, out_r - rep stosb - - movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */ - jmp .L_while_test_mmx - -.align 16,0x90 -.L_test_for_second_level_length_mmx: - testb $64, %al - jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */ - - andl $15, %eax - psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ - movd hold_mm, %ecx - andl .L_mask(,%eax,4), %ecx - addl len_r, %ecx - movl (%ebx,%ecx,4), %eax /* eax = lcode[hold & lmask] */ - jmp .L_dolen_mmx - -.align 16,0x90 -.L_test_for_second_level_dist_mmx: - testb $64, %al - jnz .L_invalid_distance_code /* if ((op & 64) != 0) */ - - andl $15, %eax - psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ - movd hold_mm, %ecx - andl .L_mask(,%eax,4), %ecx - movl dcode(%esp), %eax /* ecx = dcode */ - addl dist_r, %ecx - movl (%eax,%ecx,4), %eax /* eax = lcode[hold & lmask] */ - jmp .L_dodist_mmx - -.align 16,0x90 -.L_clip_window_mmx: -#define nbytes_r %ecx - movl %eax, nbytes_r - movl wsize(%esp), %eax /* prepare for dist compare */ - negl nbytes_r /* nbytes = -nbytes */ - movl window(%esp), from_r /* from = window */ - - cmpl dist_r, %eax - jb .L_invalid_distance_too_far /* if (dist > wsize) */ - - addl dist_r, nbytes_r /* nbytes = dist - nbytes */ - cmpl $0, write(%esp) - jne .L_wrap_around_window_mmx /* if (write != 0) */ - - subl nbytes_r, %eax - addl %eax, from_r /* from += wsize - nbytes */ - - cmpl nbytes_r, len_r - jbe .L_do_copy1_mmx /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - jmp .L_do_copy1_mmx - - cmpl nbytes_r, len_r - jbe .L_do_copy1_mmx /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - jmp .L_do_copy1_mmx - -.L_wrap_around_window_mmx: -#define write_r %eax - movl write(%esp), write_r - cmpl write_r, nbytes_r - jbe .L_contiguous_in_window_mmx /* if (write >= nbytes) */ - - addl wsize(%esp), from_r - addl write_r, from_r - subl nbytes_r, from_r /* from += wsize + write - nbytes */ - subl write_r, nbytes_r /* nbytes -= write */ -#undef write_r - - cmpl nbytes_r, len_r - jbe .L_do_copy1_mmx /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl window(%esp), from_r /* from = window */ - movl write(%esp), nbytes_r /* nbytes = write */ - cmpl nbytes_r, len_r - jbe .L_do_copy1_mmx /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - jmp .L_do_copy1_mmx - -.L_contiguous_in_window_mmx: -#define write_r %eax - addl write_r, from_r - subl nbytes_r, from_r /* from += write - nbytes */ -#undef write_r - - cmpl nbytes_r, len_r - jbe .L_do_copy1_mmx /* if (nbytes >= len) */ - - subl nbytes_r, len_r /* len -= nbytes */ - rep movsb - movl out_r, from_r - subl dist_r, from_r /* from = out - dist */ - -.L_do_copy1_mmx: -#undef nbytes_r -#define in_r %esi - movl len_r, %ecx - rep movsb - - movl in(%esp), in_r /* move in back to %esi, toss from */ - movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */ - jmp .L_while_test_mmx - -#undef hold_r -#undef bitslong_r - -#endif /* USE_MMX || RUN_TIME_MMX */ - - -/*** USE_MMX, NO_MMX, and RUNTIME_MMX from here on ***/ - -.L_invalid_distance_code: - /* else { - * strm->msg = "invalid distance code"; - * state->mode = BAD; - * } - */ - movl $.L_invalid_distance_code_msg, %ecx - movl $INFLATE_MODE_BAD, %edx - jmp .L_update_stream_state - -.L_test_for_end_of_block: - /* else if (op & 32) { - * state->mode = TYPE; - * break; - * } - */ - testb $32, %al - jz .L_invalid_literal_length_code /* if ((op & 32) == 0) */ - - movl $0, %ecx - movl $INFLATE_MODE_TYPE, %edx - jmp .L_update_stream_state - -.L_invalid_literal_length_code: - /* else { - * strm->msg = "invalid literal/length code"; - * state->mode = BAD; - * } - */ - movl $.L_invalid_literal_length_code_msg, %ecx - movl $INFLATE_MODE_BAD, %edx - jmp .L_update_stream_state - -.L_invalid_distance_too_far: - /* strm->msg = "invalid distance too far back"; - * state->mode = BAD; - */ - movl in(%esp), in_r /* from_r has in's reg, put in back */ - movl $.L_invalid_distance_too_far_msg, %ecx - movl $INFLATE_MODE_BAD, %edx - jmp .L_update_stream_state - -.L_update_stream_state: - /* set strm->msg = %ecx, strm->state->mode = %edx */ - movl strm_sp(%esp), %eax - testl %ecx, %ecx /* if (msg != NULL) */ - jz .L_skip_msg - movl %ecx, msg_strm(%eax) /* strm->msg = msg */ -.L_skip_msg: - movl state_strm(%eax), %eax /* state = strm->state */ - movl %edx, mode_state(%eax) /* state->mode = edx (BAD | TYPE) */ - jmp .L_break_loop - -.align 32,0x90 -.L_break_loop: - -/* - * Regs: - * - * bits = %ebp when mmx, and in %ebx when non-mmx - * hold = %hold_mm when mmx, and in %ebp when non-mmx - * in = %esi - * out = %edi - */ - -#if defined( USE_MMX ) || defined( RUN_TIME_MMX ) - -#if defined( RUN_TIME_MMX ) - - cmpl $DO_USE_MMX, inflate_fast_use_mmx - jne .L_update_next_in - -#endif /* RUN_TIME_MMX */ - - movl %ebp, %ebx - -.L_update_next_in: - -#endif - -#define strm_r %eax -#define state_r %edx - - /* len = bits >> 3; - * in -= len; - * bits -= len << 3; - * hold &= (1U << bits) - 1; - * state->hold = hold; - * state->bits = bits; - * strm->next_in = in; - * strm->next_out = out; - */ - movl strm_sp(%esp), strm_r - movl %ebx, %ecx - movl state_strm(strm_r), state_r - shrl $3, %ecx - subl %ecx, in_r - shll $3, %ecx - subl %ecx, %ebx - movl out_r, next_out_strm(strm_r) - movl %ebx, bits_state(state_r) - movl %ebx, %ecx - - leal buf(%esp), %ebx - cmpl %ebx, last(%esp) - jne .L_buf_not_used /* if buf != last */ - - subl %ebx, in_r /* in -= buf */ - movl next_in_strm(strm_r), %ebx - movl %ebx, last(%esp) /* last = strm->next_in */ - addl %ebx, in_r /* in += strm->next_in */ - movl avail_in_strm(strm_r), %ebx - subl $11, %ebx - addl %ebx, last(%esp) /* last = &strm->next_in[ avail_in - 11 ] */ - -.L_buf_not_used: - movl in_r, next_in_strm(strm_r) - - movl $1, %ebx - shll %cl, %ebx - decl %ebx - -#if defined( USE_MMX ) || defined( RUN_TIME_MMX ) - -#if defined( RUN_TIME_MMX ) - - cmpl $DO_USE_MMX, inflate_fast_use_mmx - jne .L_update_hold - -#endif /* RUN_TIME_MMX */ - - psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ - movd hold_mm, %ebp - - emms - -.L_update_hold: - -#endif /* USE_MMX || RUN_TIME_MMX */ - - andl %ebx, %ebp - movl %ebp, hold_state(state_r) - -#define last_r %ebx - - /* strm->avail_in = in < last ? 11 + (last - in) : 11 - (in - last) */ - movl last(%esp), last_r - cmpl in_r, last_r - jbe .L_last_is_smaller /* if (in >= last) */ - - subl in_r, last_r /* last -= in */ - addl $11, last_r /* last += 11 */ - movl last_r, avail_in_strm(strm_r) - jmp .L_fixup_out -.L_last_is_smaller: - subl last_r, in_r /* in -= last */ - negl in_r /* in = -in */ - addl $11, in_r /* in += 11 */ - movl in_r, avail_in_strm(strm_r) - -#undef last_r -#define end_r %ebx - -.L_fixup_out: - /* strm->avail_out = out < end ? 257 + (end - out) : 257 - (out - end)*/ - movl end(%esp), end_r - cmpl out_r, end_r - jbe .L_end_is_smaller /* if (out >= end) */ - - subl out_r, end_r /* end -= out */ - addl $257, end_r /* end += 257 */ - movl end_r, avail_out_strm(strm_r) - jmp .L_done -.L_end_is_smaller: - subl end_r, out_r /* out -= end */ - negl out_r /* out = -out */ - addl $257, out_r /* out += 257 */ - movl out_r, avail_out_strm(strm_r) - -#undef end_r -#undef strm_r -#undef state_r - -.L_done: - addl $local_var_size, %esp - popf - popl %ebx - popl %ebp - popl %esi - popl %edi - ret - -#if defined( GAS_ELF ) -/* elf info */ -.type inflate_fast,@function -.size inflate_fast,.-inflate_fast -#endif diff --git a/contrib/masmx64/bld_ml64.bat b/contrib/masmx64/bld_ml64.bat deleted file mode 100644 index 8f9343d..0000000 --- a/contrib/masmx64/bld_ml64.bat +++ /dev/null @@ -1,2 +0,0 @@ -ml64.exe /Flinffasx64 /c /Zi inffasx64.asm -ml64.exe /Flgvmat64 /c /Zi gvmat64.asm diff --git a/contrib/masmx64/gvmat64.asm b/contrib/masmx64/gvmat64.asm deleted file mode 100644 index 9879c28..0000000 --- a/contrib/masmx64/gvmat64.asm +++ /dev/null @@ -1,553 +0,0 @@ -;uInt longest_match_x64( -; deflate_state *s, -; IPos cur_match); /* current match */ - -; gvmat64.asm -- Asm portion of the optimized longest_match for 32 bits x86_64 -; (AMD64 on Athlon 64, Opteron, Phenom -; and Intel EM64T on Pentium 4 with EM64T, Pentium D, Core 2 Duo, Core I5/I7) -; Copyright (C) 1995-2010 Jean-loup Gailly, Brian Raiter and Gilles Vollant. -; -; File written by Gilles Vollant, by converting to assembly the longest_match -; from Jean-loup Gailly in deflate.c of zLib and infoZip zip. -; -; and by taking inspiration on asm686 with masm, optimised assembly code -; from Brian Raiter, written 1998 -; -; This software is provided 'as-is', without any express or implied -; warranty. In no event will the authors be held liable for any damages -; arising from the use of this software. -; -; Permission is granted to anyone to use this software for any purpose, -; including commercial applications, and to alter it and redistribute it -; freely, subject to the following restrictions: -; -; 1. The origin of this software must not be misrepresented; you must not -; claim that you wrote the original software. If you use this software -; in a product, an acknowledgment in the product documentation would be -; appreciated but is not required. -; 2. Altered source versions must be plainly marked as such, and must not be -; misrepresented as being the original software -; 3. This notice may not be removed or altered from any source distribution. -; -; -; -; http://www.zlib.net -; http://www.winimage.com/zLibDll -; http://www.muppetlabs.com/~breadbox/software/assembly.html -; -; to compile this file for infozip Zip, I use option: -; ml64.exe /Flgvmat64 /c /Zi /DINFOZIP gvmat64.asm -; -; to compile this file for zLib, I use option: -; ml64.exe /Flgvmat64 /c /Zi gvmat64.asm -; Be carrefull to adapt zlib1222add below to your version of zLib -; (if you use a version of zLib before 1.0.4 or after 1.2.2.2, change -; value of zlib1222add later) -; -; This file compile with Microsoft Macro Assembler (x64) for AMD64 -; -; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK -; -; (you can get Windows WDK with ml64 for AMD64 from -; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price) -; - - -;uInt longest_match(s, cur_match) -; deflate_state *s; -; IPos cur_match; /* current match */ -.code -longest_match PROC - - -;LocalVarsSize equ 88 - LocalVarsSize equ 72 - -; register used : rax,rbx,rcx,rdx,rsi,rdi,r8,r9,r10,r11,r12 -; free register : r14,r15 -; register can be saved : rsp - - chainlenwmask equ rsp + 8 - LocalVarsSize ; high word: current chain len - ; low word: s->wmask -;window equ rsp + xx - LocalVarsSize ; local copy of s->window ; stored in r10 -;windowbestlen equ rsp + xx - LocalVarsSize ; s->window + bestlen , use r10+r11 -;scanstart equ rsp + xx - LocalVarsSize ; first two bytes of string ; stored in r12w -;scanend equ rsp + xx - LocalVarsSize ; last two bytes of string use ebx -;scanalign equ rsp + xx - LocalVarsSize ; dword-misalignment of string r13 -;bestlen equ rsp + xx - LocalVarsSize ; size of best match so far -> r11d -;scan equ rsp + xx - LocalVarsSize ; ptr to string wanting match -> r9 -IFDEF INFOZIP -ELSE - nicematch equ (rsp + 16 - LocalVarsSize) ; a good enough match size -ENDIF - -save_rdi equ rsp + 24 - LocalVarsSize -save_rsi equ rsp + 32 - LocalVarsSize -save_rbx equ rsp + 40 - LocalVarsSize -save_rbp equ rsp + 48 - LocalVarsSize -save_r12 equ rsp + 56 - LocalVarsSize -save_r13 equ rsp + 64 - LocalVarsSize -;save_r14 equ rsp + 72 - LocalVarsSize -;save_r15 equ rsp + 80 - LocalVarsSize - - -; summary of register usage -; scanend ebx -; scanendw bx -; chainlenwmask edx -; curmatch rsi -; curmatchd esi -; windowbestlen r8 -; scanalign r9 -; scanalignd r9d -; window r10 -; bestlen r11 -; bestlend r11d -; scanstart r12d -; scanstartw r12w -; scan r13 -; nicematch r14d -; limit r15 -; limitd r15d -; prev rcx - -; all the +4 offsets are due to the addition of pending_buf_size (in zlib -; in the deflate_state structure since the asm code was first written -; (if you compile with zlib 1.0.4 or older, remove the +4). -; Note : these value are good with a 8 bytes boundary pack structure - - - MAX_MATCH equ 258 - MIN_MATCH equ 3 - MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) - - -;;; Offsets for fields in the deflate_state structure. These numbers -;;; are calculated from the definition of deflate_state, with the -;;; assumption that the compiler will dword-align the fields. (Thus, -;;; changing the definition of deflate_state could easily cause this -;;; program to crash horribly, without so much as a warning at -;;; compile time. Sigh.) - -; all the +zlib1222add offsets are due to the addition of fields -; in zlib in the deflate_state structure since the asm code was first written -; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). -; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). -; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). - - -IFDEF INFOZIP - -_DATA SEGMENT -COMM window_size:DWORD -; WMask ; 7fff -COMM window:BYTE:010040H -COMM prev:WORD:08000H -; MatchLen : unused -; PrevMatch : unused -COMM strstart:DWORD -COMM match_start:DWORD -; Lookahead : ignore -COMM prev_length:DWORD ; PrevLen -COMM max_chain_length:DWORD -COMM good_match:DWORD -COMM nice_match:DWORD -prev_ad equ OFFSET prev -window_ad equ OFFSET window -nicematch equ nice_match -_DATA ENDS -WMask equ 07fffh - -ELSE - - IFNDEF zlib1222add - zlib1222add equ 8 - ENDIF -dsWSize equ 56+zlib1222add+(zlib1222add/2) -dsWMask equ 64+zlib1222add+(zlib1222add/2) -dsWindow equ 72+zlib1222add -dsPrev equ 88+zlib1222add -dsMatchLen equ 128+zlib1222add -dsPrevMatch equ 132+zlib1222add -dsStrStart equ 140+zlib1222add -dsMatchStart equ 144+zlib1222add -dsLookahead equ 148+zlib1222add -dsPrevLen equ 152+zlib1222add -dsMaxChainLen equ 156+zlib1222add -dsGoodMatch equ 172+zlib1222add -dsNiceMatch equ 176+zlib1222add - -window_size equ [ rcx + dsWSize] -WMask equ [ rcx + dsWMask] -window_ad equ [ rcx + dsWindow] -prev_ad equ [ rcx + dsPrev] -strstart equ [ rcx + dsStrStart] -match_start equ [ rcx + dsMatchStart] -Lookahead equ [ rcx + dsLookahead] ; 0ffffffffh on infozip -prev_length equ [ rcx + dsPrevLen] -max_chain_length equ [ rcx + dsMaxChainLen] -good_match equ [ rcx + dsGoodMatch] -nice_match equ [ rcx + dsNiceMatch] -ENDIF - -; parameter 1 in r8(deflate state s), param 2 in rdx (cur match) - -; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and -; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp -; -; All registers must be preserved across the call, except for -; rax, rcx, rdx, r8, r9, r10, and r11, which are scratch. - - - -;;; Save registers that the compiler may be using, and adjust esp to -;;; make room for our stack frame. - - -;;; Retrieve the function arguments. r8d will hold cur_match -;;; throughout the entire function. edx will hold the pointer to the -;;; deflate_state structure during the function's setup (before -;;; entering the main loop. - -; parameter 1 in rcx (deflate_state* s), param 2 in edx -> r8 (cur match) - -; this clear high 32 bits of r8, which can be garbage in both r8 and rdx - - mov [save_rdi],rdi - mov [save_rsi],rsi - mov [save_rbx],rbx - mov [save_rbp],rbp -IFDEF INFOZIP - mov r8d,ecx -ELSE - mov r8d,edx -ENDIF - mov [save_r12],r12 - mov [save_r13],r13 -; mov [save_r14],r14 -; mov [save_r15],r15 - - -;;; uInt wmask = s->w_mask; -;;; unsigned chain_length = s->max_chain_length; -;;; if (s->prev_length >= s->good_match) { -;;; chain_length >>= 2; -;;; } - - mov edi, prev_length - mov esi, good_match - mov eax, WMask - mov ebx, max_chain_length - cmp edi, esi - jl LastMatchGood - shr ebx, 2 -LastMatchGood: - -;;; chainlen is decremented once beforehand so that the function can -;;; use the sign flag instead of the zero flag for the exit test. -;;; It is then shifted into the high word, to make room for the wmask -;;; value, which it will always accompany. - - dec ebx - shl ebx, 16 - or ebx, eax - -;;; on zlib only -;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; - -IFDEF INFOZIP - mov [chainlenwmask], ebx -; on infozip nice_match = [nice_match] -ELSE - mov eax, nice_match - mov [chainlenwmask], ebx - mov r10d, Lookahead - cmp r10d, eax - cmovnl r10d, eax - mov [nicematch],r10d -ENDIF - -;;; register Bytef *scan = s->window + s->strstart; - mov r10, window_ad - mov ebp, strstart - lea r13, [r10 + rbp] - -;;; Determine how many bytes the scan ptr is off from being -;;; dword-aligned. - - mov r9,r13 - neg r13 - and r13,3 - -;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? -;;; s->strstart - (IPos)MAX_DIST(s) : NIL; -IFDEF INFOZIP - mov eax,07efah ; MAX_DIST = (WSIZE-MIN_LOOKAHEAD) (0x8000-(3+8+1)) -ELSE - mov eax, window_size - sub eax, MIN_LOOKAHEAD -ENDIF - xor edi,edi - sub ebp, eax - - mov r11d, prev_length - - cmovng ebp,edi - -;;; int best_len = s->prev_length; - - -;;; Store the sum of s->window + best_len in esi locally, and in esi. - - lea rsi,[r10+r11] - -;;; register ush scan_start = *(ushf*)scan; -;;; register ush scan_end = *(ushf*)(scan+best_len-1); -;;; Posf *prev = s->prev; - - movzx r12d,word ptr [r9] - movzx ebx, word ptr [r9 + r11 - 1] - - mov rdi, prev_ad - -;;; Jump into the main loop. - - mov edx, [chainlenwmask] - - cmp bx,word ptr [rsi + r8 - 1] - jz LookupLoopIsZero - -LookupLoop1: - and r8d, edx - - movzx r8d, word ptr [rdi + r8*2] - cmp r8d, ebp - jbe LeaveNow - sub edx, 00010000h - js LeaveNow - -LoopEntry1: - cmp bx,word ptr [rsi + r8 - 1] - jz LookupLoopIsZero - -LookupLoop2: - and r8d, edx - - movzx r8d, word ptr [rdi + r8*2] - cmp r8d, ebp - jbe LeaveNow - sub edx, 00010000h - js LeaveNow - -LoopEntry2: - cmp bx,word ptr [rsi + r8 - 1] - jz LookupLoopIsZero - -LookupLoop4: - and r8d, edx - - movzx r8d, word ptr [rdi + r8*2] - cmp r8d, ebp - jbe LeaveNow - sub edx, 00010000h - js LeaveNow - -LoopEntry4: - - cmp bx,word ptr [rsi + r8 - 1] - jnz LookupLoop1 - jmp LookupLoopIsZero - - -;;; do { -;;; match = s->window + cur_match; -;;; if (*(ushf*)(match+best_len-1) != scan_end || -;;; *(ushf*)match != scan_start) continue; -;;; [...] -;;; } while ((cur_match = prev[cur_match & wmask]) > limit -;;; && --chain_length != 0); -;;; -;;; Here is the inner loop of the function. The function will spend the -;;; majority of its time in this loop, and majority of that time will -;;; be spent in the first ten instructions. -;;; -;;; Within this loop: -;;; ebx = scanend -;;; r8d = curmatch -;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) -;;; esi = windowbestlen - i.e., (window + bestlen) -;;; edi = prev -;;; ebp = limit - -LookupLoop: - and r8d, edx - - movzx r8d, word ptr [rdi + r8*2] - cmp r8d, ebp - jbe LeaveNow - sub edx, 00010000h - js LeaveNow - -LoopEntry: - - cmp bx,word ptr [rsi + r8 - 1] - jnz LookupLoop1 -LookupLoopIsZero: - cmp r12w, word ptr [r10 + r8] - jnz LookupLoop1 - - -;;; Store the current value of chainlen. - mov [chainlenwmask], edx - -;;; Point edi to the string under scrutiny, and esi to the string we -;;; are hoping to match it up with. In actuality, esi and edi are -;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is -;;; initialized to -(MAX_MATCH_8 - scanalign). - - lea rsi,[r8+r10] - mov rdx, 0fffffffffffffef8h; -(MAX_MATCH_8) - lea rsi, [rsi + r13 + 0108h] ;MAX_MATCH_8] - lea rdi, [r9 + r13 + 0108h] ;MAX_MATCH_8] - - prefetcht1 [rsi+rdx] - prefetcht1 [rdi+rdx] - - -;;; Test the strings for equality, 8 bytes at a time. At the end, -;;; adjust rdx so that it is offset to the exact byte that mismatched. -;;; -;;; We already know at this point that the first three bytes of the -;;; strings match each other, and they can be safely passed over before -;;; starting the compare loop. So what this code does is skip over 0-3 -;;; bytes, as much as necessary in order to dword-align the edi -;;; pointer. (rsi will still be misaligned three times out of four.) -;;; -;;; It should be confessed that this loop usually does not represent -;;; much of the total running time. Replacing it with a more -;;; straightforward "rep cmpsb" would not drastically degrade -;;; performance. - - -LoopCmps: - mov rax, [rsi + rdx] - xor rax, [rdi + rdx] - jnz LeaveLoopCmps - - mov rax, [rsi + rdx + 8] - xor rax, [rdi + rdx + 8] - jnz LeaveLoopCmps8 - - - mov rax, [rsi + rdx + 8+8] - xor rax, [rdi + rdx + 8+8] - jnz LeaveLoopCmps16 - - add rdx,8+8+8 - - jnz short LoopCmps - jmp short LenMaximum -LeaveLoopCmps16: add rdx,8 -LeaveLoopCmps8: add rdx,8 -LeaveLoopCmps: - - test eax, 0000FFFFh - jnz LenLower - - test eax,0ffffffffh - - jnz LenLower32 - - add rdx,4 - shr rax,32 - or ax,ax - jnz LenLower - -LenLower32: - shr eax,16 - add rdx,2 -LenLower: sub al, 1 - adc rdx, 0 -;;; Calculate the length of the match. If it is longer than MAX_MATCH, -;;; then automatically accept it as the best possible match and leave. - - lea rax, [rdi + rdx] - sub rax, r9 - cmp eax, MAX_MATCH - jge LenMaximum - -;;; If the length of the match is not longer than the best match we -;;; have so far, then forget it and return to the lookup loop. -;/////////////////////////////////// - - cmp eax, r11d - jg LongerMatch - - lea rsi,[r10+r11] - - mov rdi, prev_ad - mov edx, [chainlenwmask] - jmp LookupLoop - -;;; s->match_start = cur_match; -;;; best_len = len; -;;; if (len >= nice_match) break; -;;; scan_end = *(ushf*)(scan+best_len-1); - -LongerMatch: - mov r11d, eax - mov match_start, r8d - cmp eax, [nicematch] - jge LeaveNow - - lea rsi,[r10+rax] - - movzx ebx, word ptr [r9 + rax - 1] - mov rdi, prev_ad - mov edx, [chainlenwmask] - jmp LookupLoop - -;;; Accept the current string, with the maximum possible length. - -LenMaximum: - mov r11d,MAX_MATCH - mov match_start, r8d - -;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; -;;; return s->lookahead; - -LeaveNow: -IFDEF INFOZIP - mov eax,r11d -ELSE - mov eax, Lookahead - cmp r11d, eax - cmovng eax, r11d -ENDIF - -;;; Restore the stack and return from whence we came. - - - mov rsi,[save_rsi] - mov rdi,[save_rdi] - mov rbx,[save_rbx] - mov rbp,[save_rbp] - mov r12,[save_r12] - mov r13,[save_r13] -; mov r14,[save_r14] -; mov r15,[save_r15] - - - ret 0 -; please don't remove this string ! -; Your can freely use gvmat64 in any free or commercial app -; but it is far better don't remove the string in the binary! - db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998, converted to amd 64 by Gilles Vollant 2005",0dh,0ah,0 -longest_match ENDP - -match_init PROC - ret 0 -match_init ENDP - - -END diff --git a/contrib/masmx64/inffas8664.c b/contrib/masmx64/inffas8664.c deleted file mode 100644 index e8af06f..0000000 --- a/contrib/masmx64/inffas8664.c +++ /dev/null @@ -1,186 +0,0 @@ -/* inffas8664.c is a hand tuned assembler version of inffast.c - fast decoding - * version for AMD64 on Windows using Microsoft C compiler - * - * Copyright (C) 1995-2003 Mark Adler - * For conditions of distribution and use, see copyright notice in zlib.h - * - * Copyright (C) 2003 Chris Anderson - * Please use the copyright conditions above. - * - * 2005 - Adaptation to Microsoft C Compiler for AMD64 by Gilles Vollant - * - * inffas8664.c call function inffas8664fnc in inffasx64.asm - * inffasx64.asm is automatically convert from AMD64 portion of inffas86.c - * - * Dec-29-2003 -- I added AMD64 inflate asm support. This version is also - * slightly quicker on x86 systems because, instead of using rep movsb to copy - * data, it uses rep movsw, which moves data in 2-byte chunks instead of single - * bytes. I've tested the AMD64 code on a Fedora Core 1 + the x86_64 updates - * from http://fedora.linux.duke.edu/fc1_x86_64 - * which is running on an Athlon 64 3000+ / Gigabyte GA-K8VT800M system with - * 1GB ram. The 64-bit version is about 4% faster than the 32-bit version, - * when decompressing mozilla-source-1.3.tar.gz. - * - * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from - * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at - * the moment. I have successfully compiled and tested this code with gcc2.96, - * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S - * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX - * enabled. I will attempt to merge the MMX code into this version. Newer - * versions of this and inffast.S can be found at - * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ - * - */ - -#include -#include "zutil.h" -#include "inftrees.h" -#include "inflate.h" -#include "inffast.h" - -/* Mark Adler's comments from inffast.c: */ - -/* - Decode literal, length, and distance codes and write out the resulting - literal and match bytes until either not enough input or output is - available, an end-of-block is encountered, or a data error is encountered. - When large enough input and output buffers are supplied to inflate(), for - example, a 16K input buffer and a 64K output buffer, more than 95% of the - inflate execution time is spent in this routine. - - Entry assumptions: - - state->mode == LEN - strm->avail_in >= 6 - strm->avail_out >= 258 - start >= strm->avail_out - state->bits < 8 - - On return, state->mode is one of: - - LEN -- ran out of enough output space or enough available input - TYPE -- reached end of block code, inflate() to interpret next block - BAD -- error in block data - - Notes: - - - The maximum input bits used by a length/distance pair is 15 bits for the - length code, 5 bits for the length extra, 15 bits for the distance code, - and 13 bits for the distance extra. This totals 48 bits, or six bytes. - Therefore if strm->avail_in >= 6, then there is enough input to avoid - checking for available input while decoding. - - - The maximum bytes that a single length/distance pair can output is 258 - bytes, which is the maximum length that can be coded. inflate_fast() - requires strm->avail_out >= 258 for each loop to avoid checking for - output space. - */ - - - - typedef struct inffast_ar { -/* 64 32 x86 x86_64 */ -/* ar offset register */ -/* 0 0 */ void *esp; /* esp save */ -/* 8 4 */ void *ebp; /* ebp save */ -/* 16 8 */ unsigned char FAR *in; /* esi rsi local strm->next_in */ -/* 24 12 */ unsigned char FAR *last; /* r9 while in < last */ -/* 32 16 */ unsigned char FAR *out; /* edi rdi local strm->next_out */ -/* 40 20 */ unsigned char FAR *beg; /* inflate()'s init next_out */ -/* 48 24 */ unsigned char FAR *end; /* r10 while out < end */ -/* 56 28 */ unsigned char FAR *window;/* size of window, wsize!=0 */ -/* 64 32 */ code const FAR *lcode; /* ebp rbp local strm->lencode */ -/* 72 36 */ code const FAR *dcode; /* r11 local strm->distcode */ -/* 80 40 */ size_t /*unsigned long */hold; /* edx rdx local strm->hold */ -/* 88 44 */ unsigned bits; /* ebx rbx local strm->bits */ -/* 92 48 */ unsigned wsize; /* window size */ -/* 96 52 */ unsigned write; /* window write index */ -/*100 56 */ unsigned lmask; /* r12 mask for lcode */ -/*104 60 */ unsigned dmask; /* r13 mask for dcode */ -/*108 64 */ unsigned len; /* r14 match length */ -/*112 68 */ unsigned dist; /* r15 match distance */ -/*116 72 */ unsigned status; /* set when state chng*/ - } type_ar; -#ifdef ASMINF - -void inflate_fast(strm, start) -z_streamp strm; -unsigned start; /* inflate()'s starting value for strm->avail_out */ -{ - struct inflate_state FAR *state; - type_ar ar; - void inffas8664fnc(struct inffast_ar * par); - - - -#if (defined( __GNUC__ ) && defined( __amd64__ ) && ! defined( __i386 )) || (defined(_MSC_VER) && defined(_M_AMD64)) -#define PAD_AVAIL_IN 6 -#define PAD_AVAIL_OUT 258 -#else -#define PAD_AVAIL_IN 5 -#define PAD_AVAIL_OUT 257 -#endif - - /* copy state to local variables */ - state = (struct inflate_state FAR *)strm->state; - - ar.in = strm->next_in; - ar.last = ar.in + (strm->avail_in - PAD_AVAIL_IN); - ar.out = strm->next_out; - ar.beg = ar.out - (start - strm->avail_out); - ar.end = ar.out + (strm->avail_out - PAD_AVAIL_OUT); - ar.wsize = state->wsize; - ar.write = state->wnext; - ar.window = state->window; - ar.hold = state->hold; - ar.bits = state->bits; - ar.lcode = state->lencode; - ar.dcode = state->distcode; - ar.lmask = (1U << state->lenbits) - 1; - ar.dmask = (1U << state->distbits) - 1; - - /* decode literals and length/distances until end-of-block or not enough - input data or output space */ - - /* align in on 1/2 hold size boundary */ - while (((size_t)(void *)ar.in & (sizeof(ar.hold) / 2 - 1)) != 0) { - ar.hold += (unsigned long)*ar.in++ << ar.bits; - ar.bits += 8; - } - - inffas8664fnc(&ar); - - if (ar.status > 1) { - if (ar.status == 2) - strm->msg = "invalid literal/length code"; - else if (ar.status == 3) - strm->msg = "invalid distance code"; - else - strm->msg = "invalid distance too far back"; - state->mode = BAD; - } - else if ( ar.status == 1 ) { - state->mode = TYPE; - } - - /* return unused bytes (on entry, bits < 8, so in won't go too far back) */ - ar.len = ar.bits >> 3; - ar.in -= ar.len; - ar.bits -= ar.len << 3; - ar.hold &= (1U << ar.bits) - 1; - - /* update state and return */ - strm->next_in = ar.in; - strm->next_out = ar.out; - strm->avail_in = (unsigned)(ar.in < ar.last ? - PAD_AVAIL_IN + (ar.last - ar.in) : - PAD_AVAIL_IN - (ar.in - ar.last)); - strm->avail_out = (unsigned)(ar.out < ar.end ? - PAD_AVAIL_OUT + (ar.end - ar.out) : - PAD_AVAIL_OUT - (ar.out - ar.end)); - state->hold = (unsigned long)ar.hold; - state->bits = ar.bits; - return; -} - -#endif diff --git a/contrib/masmx64/inffasx64.asm b/contrib/masmx64/inffasx64.asm deleted file mode 100644 index 60a8d89..0000000 --- a/contrib/masmx64/inffasx64.asm +++ /dev/null @@ -1,396 +0,0 @@ -; inffasx64.asm is a hand tuned assembler version of inffast.c - fast decoding -; version for AMD64 on Windows using Microsoft C compiler -; -; inffasx64.asm is automatically convert from AMD64 portion of inffas86.c -; inffasx64.asm is called by inffas8664.c, which contain more info. - - -; to compile this file, I use option -; ml64.exe /Flinffasx64 /c /Zi inffasx64.asm -; with Microsoft Macro Assembler (x64) for AMD64 -; - -; This file compile with Microsoft Macro Assembler (x64) for AMD64 -; -; ml64.exe is given with Visual Studio 2005/2008/2010 and Windows WDK -; -; (you can get Windows WDK with ml64 for AMD64 from -; http://www.microsoft.com/whdc/Devtools/wdk/default.mspx for low price) -; - - -.code -inffas8664fnc PROC - -; see http://weblogs.asp.net/oldnewthing/archive/2004/01/14/58579.aspx and -; http://msdn.microsoft.com/library/en-us/kmarch/hh/kmarch/64bitAMD_8e951dd2-ee77-4728-8702-55ce4b5dd24a.xml.asp -; -; All registers must be preserved across the call, except for -; rax, rcx, rdx, r8, r-9, r10, and r11, which are scratch. - - - mov [rsp-8],rsi - mov [rsp-16],rdi - mov [rsp-24],r12 - mov [rsp-32],r13 - mov [rsp-40],r14 - mov [rsp-48],r15 - mov [rsp-56],rbx - - mov rax,rcx - - mov [rax+8], rbp ; /* save regs rbp and rsp */ - mov [rax], rsp - - mov rsp, rax ; /* make rsp point to &ar */ - - mov rsi, [rsp+16] ; /* rsi = in */ - mov rdi, [rsp+32] ; /* rdi = out */ - mov r9, [rsp+24] ; /* r9 = last */ - mov r10, [rsp+48] ; /* r10 = end */ - mov rbp, [rsp+64] ; /* rbp = lcode */ - mov r11, [rsp+72] ; /* r11 = dcode */ - mov rdx, [rsp+80] ; /* rdx = hold */ - mov ebx, [rsp+88] ; /* ebx = bits */ - mov r12d, [rsp+100] ; /* r12d = lmask */ - mov r13d, [rsp+104] ; /* r13d = dmask */ - ; /* r14d = len */ - ; /* r15d = dist */ - - - cld - cmp r10, rdi - je L_one_time ; /* if only one decode left */ - cmp r9, rsi - - jne L_do_loop - - -L_one_time: - mov r8, r12 ; /* r8 = lmask */ - cmp bl, 32 - ja L_get_length_code_one_time - - lodsd ; /* eax = *(uint *)in++ */ - mov cl, bl ; /* cl = bits, needs it for shifting */ - add bl, 32 ; /* bits += 32 */ - shl rax, cl - or rdx, rax ; /* hold |= *((uint *)in)++ << bits */ - jmp L_get_length_code_one_time - -ALIGN 4 -L_while_test: - cmp r10, rdi - jbe L_break_loop - cmp r9, rsi - jbe L_break_loop - -L_do_loop: - mov r8, r12 ; /* r8 = lmask */ - cmp bl, 32 - ja L_get_length_code ; /* if (32 < bits) */ - - lodsd ; /* eax = *(uint *)in++ */ - mov cl, bl ; /* cl = bits, needs it for shifting */ - add bl, 32 ; /* bits += 32 */ - shl rax, cl - or rdx, rax ; /* hold |= *((uint *)in)++ << bits */ - -L_get_length_code: - and r8, rdx ; /* r8 &= hold */ - mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */ - - mov cl, ah ; /* cl = this.bits */ - sub bl, ah ; /* bits -= this.bits */ - shr rdx, cl ; /* hold >>= this.bits */ - - test al, al - jnz L_test_for_length_base ; /* if (op != 0) 45.7% */ - - mov r8, r12 ; /* r8 = lmask */ - shr eax, 16 ; /* output this.val char */ - stosb - -L_get_length_code_one_time: - and r8, rdx ; /* r8 &= hold */ - mov eax, [rbp+r8*4] ; /* eax = lcode[hold & lmask] */ - -L_dolen: - mov cl, ah ; /* cl = this.bits */ - sub bl, ah ; /* bits -= this.bits */ - shr rdx, cl ; /* hold >>= this.bits */ - - test al, al - jnz L_test_for_length_base ; /* if (op != 0) 45.7% */ - - shr eax, 16 ; /* output this.val char */ - stosb - jmp L_while_test - -ALIGN 4 -L_test_for_length_base: - mov r14d, eax ; /* len = this */ - shr r14d, 16 ; /* len = this.val */ - mov cl, al - - test al, 16 - jz L_test_for_second_level_length ; /* if ((op & 16) == 0) 8% */ - and cl, 15 ; /* op &= 15 */ - jz L_decode_distance ; /* if (!op) */ - -L_add_bits_to_len: - sub bl, cl - xor eax, eax - inc eax - shl eax, cl - dec eax - and eax, edx ; /* eax &= hold */ - shr rdx, cl - add r14d, eax ; /* len += hold & mask[op] */ - -L_decode_distance: - mov r8, r13 ; /* r8 = dmask */ - cmp bl, 32 - ja L_get_distance_code ; /* if (32 < bits) */ - - lodsd ; /* eax = *(uint *)in++ */ - mov cl, bl ; /* cl = bits, needs it for shifting */ - add bl, 32 ; /* bits += 32 */ - shl rax, cl - or rdx, rax ; /* hold |= *((uint *)in)++ << bits */ - -L_get_distance_code: - and r8, rdx ; /* r8 &= hold */ - mov eax, [r11+r8*4] ; /* eax = dcode[hold & dmask] */ - -L_dodist: - mov r15d, eax ; /* dist = this */ - shr r15d, 16 ; /* dist = this.val */ - mov cl, ah - sub bl, ah ; /* bits -= this.bits */ - shr rdx, cl ; /* hold >>= this.bits */ - mov cl, al ; /* cl = this.op */ - - test al, 16 ; /* if ((op & 16) == 0) */ - jz L_test_for_second_level_dist - and cl, 15 ; /* op &= 15 */ - jz L_check_dist_one - -L_add_bits_to_dist: - sub bl, cl - xor eax, eax - inc eax - shl eax, cl - dec eax ; /* (1 << op) - 1 */ - and eax, edx ; /* eax &= hold */ - shr rdx, cl - add r15d, eax ; /* dist += hold & ((1 << op) - 1) */ - -L_check_window: - mov r8, rsi ; /* save in so from can use it's reg */ - mov rax, rdi - sub rax, [rsp+40] ; /* nbytes = out - beg */ - - cmp eax, r15d - jb L_clip_window ; /* if (dist > nbytes) 4.2% */ - - mov ecx, r14d ; /* ecx = len */ - mov rsi, rdi - sub rsi, r15 ; /* from = out - dist */ - - sar ecx, 1 - jnc L_copy_two ; /* if len % 2 == 0 */ - - rep movsw - mov al, [rsi] - mov [rdi], al - inc rdi - - mov rsi, r8 ; /* move in back to %rsi, toss from */ - jmp L_while_test - -L_copy_two: - rep movsw - mov rsi, r8 ; /* move in back to %rsi, toss from */ - jmp L_while_test - -ALIGN 4 -L_check_dist_one: - cmp r15d, 1 ; /* if dist 1, is a memset */ - jne L_check_window - cmp [rsp+40], rdi ; /* if out == beg, outside window */ - je L_check_window - - mov ecx, r14d ; /* ecx = len */ - mov al, [rdi-1] - mov ah, al - - sar ecx, 1 - jnc L_set_two - mov [rdi], al - inc rdi - -L_set_two: - rep stosw - jmp L_while_test - -ALIGN 4 -L_test_for_second_level_length: - test al, 64 - jnz L_test_for_end_of_block ; /* if ((op & 64) != 0) */ - - xor eax, eax - inc eax - shl eax, cl - dec eax - and eax, edx ; /* eax &= hold */ - add eax, r14d ; /* eax += len */ - mov eax, [rbp+rax*4] ; /* eax = lcode[val+(hold&mask[op])]*/ - jmp L_dolen - -ALIGN 4 -L_test_for_second_level_dist: - test al, 64 - jnz L_invalid_distance_code ; /* if ((op & 64) != 0) */ - - xor eax, eax - inc eax - shl eax, cl - dec eax - and eax, edx ; /* eax &= hold */ - add eax, r15d ; /* eax += dist */ - mov eax, [r11+rax*4] ; /* eax = dcode[val+(hold&mask[op])]*/ - jmp L_dodist - -ALIGN 4 -L_clip_window: - mov ecx, eax ; /* ecx = nbytes */ - mov eax, [rsp+92] ; /* eax = wsize, prepare for dist cmp */ - neg ecx ; /* nbytes = -nbytes */ - - cmp eax, r15d - jb L_invalid_distance_too_far ; /* if (dist > wsize) */ - - add ecx, r15d ; /* nbytes = dist - nbytes */ - cmp dword ptr [rsp+96], 0 - jne L_wrap_around_window ; /* if (write != 0) */ - - mov rsi, [rsp+56] ; /* from = window */ - sub eax, ecx ; /* eax -= nbytes */ - add rsi, rax ; /* from += wsize - nbytes */ - - mov eax, r14d ; /* eax = len */ - cmp r14d, ecx - jbe L_do_copy ; /* if (nbytes >= len) */ - - sub eax, ecx ; /* eax -= nbytes */ - rep movsb - mov rsi, rdi - sub rsi, r15 ; /* from = &out[ -dist ] */ - jmp L_do_copy - -ALIGN 4 -L_wrap_around_window: - mov eax, [rsp+96] ; /* eax = write */ - cmp ecx, eax - jbe L_contiguous_in_window ; /* if (write >= nbytes) */ - - mov esi, [rsp+92] ; /* from = wsize */ - add rsi, [rsp+56] ; /* from += window */ - add rsi, rax ; /* from += write */ - sub rsi, rcx ; /* from -= nbytes */ - sub ecx, eax ; /* nbytes -= write */ - - mov eax, r14d ; /* eax = len */ - cmp eax, ecx - jbe L_do_copy ; /* if (nbytes >= len) */ - - sub eax, ecx ; /* len -= nbytes */ - rep movsb - mov rsi, [rsp+56] ; /* from = window */ - mov ecx, [rsp+96] ; /* nbytes = write */ - cmp eax, ecx - jbe L_do_copy ; /* if (nbytes >= len) */ - - sub eax, ecx ; /* len -= nbytes */ - rep movsb - mov rsi, rdi - sub rsi, r15 ; /* from = out - dist */ - jmp L_do_copy - -ALIGN 4 -L_contiguous_in_window: - mov rsi, [rsp+56] ; /* rsi = window */ - add rsi, rax - sub rsi, rcx ; /* from += write - nbytes */ - - mov eax, r14d ; /* eax = len */ - cmp eax, ecx - jbe L_do_copy ; /* if (nbytes >= len) */ - - sub eax, ecx ; /* len -= nbytes */ - rep movsb - mov rsi, rdi - sub rsi, r15 ; /* from = out - dist */ - jmp L_do_copy ; /* if (nbytes >= len) */ - -ALIGN 4 -L_do_copy: - mov ecx, eax ; /* ecx = len */ - rep movsb - - mov rsi, r8 ; /* move in back to %esi, toss from */ - jmp L_while_test - -L_test_for_end_of_block: - test al, 32 - jz L_invalid_literal_length_code - mov dword ptr [rsp+116], 1 - jmp L_break_loop_with_status - -L_invalid_literal_length_code: - mov dword ptr [rsp+116], 2 - jmp L_break_loop_with_status - -L_invalid_distance_code: - mov dword ptr [rsp+116], 3 - jmp L_break_loop_with_status - -L_invalid_distance_too_far: - mov dword ptr [rsp+116], 4 - jmp L_break_loop_with_status - -L_break_loop: - mov dword ptr [rsp+116], 0 - -L_break_loop_with_status: -; /* put in, out, bits, and hold back into ar and pop esp */ - mov [rsp+16], rsi ; /* in */ - mov [rsp+32], rdi ; /* out */ - mov [rsp+88], ebx ; /* bits */ - mov [rsp+80], rdx ; /* hold */ - - mov rax, [rsp] ; /* restore rbp and rsp */ - mov rbp, [rsp+8] - mov rsp, rax - - - - mov rsi,[rsp-8] - mov rdi,[rsp-16] - mov r12,[rsp-24] - mov r13,[rsp-32] - mov r14,[rsp-40] - mov r15,[rsp-48] - mov rbx,[rsp-56] - - ret 0 -; : -; : "m" (ar) -; : "memory", "%rax", "%rbx", "%rcx", "%rdx", "%rsi", "%rdi", -; "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" -; ); - -inffas8664fnc ENDP -;_TEXT ENDS -END diff --git a/contrib/masmx64/readme.txt b/contrib/masmx64/readme.txt deleted file mode 100644 index 2da6733..0000000 --- a/contrib/masmx64/readme.txt +++ /dev/null @@ -1,31 +0,0 @@ -Summary -------- -This directory contains ASM implementations of the functions -longest_match() and inflate_fast(), for 64 bits x86 (both AMD64 and Intel EM64t), -for use with Microsoft Macro Assembler (x64) for AMD64 and Microsoft C++ 64 bits. - -gvmat64.asm is written by Gilles Vollant (2005), by using Brian Raiter 686/32 bits - assembly optimized version from Jean-loup Gailly original longest_match function - -inffasx64.asm and inffas8664.c were written by Chris Anderson, by optimizing - original function from Mark Adler - -Use instructions ----------------- -Assemble the .asm files using MASM and put the object files into the zlib source -directory. You can also get object files here: - - http://www.winimage.com/zLibDll/zlib124_masm_obj.zip - -define ASMV and ASMINF in your project. Include inffas8664.c in your source tree, -and inffasx64.obj and gvmat64.obj as object to link. - - -Build instructions ------------------- -run bld_64.bat with Microsoft Macro Assembler (x64) for AMD64 (ml64.exe) - -ml64.exe is given with Visual Studio 2005, Windows 2003 server DDK - -You can get Windows 2003 server DDK with ml64 and cl for AMD64 from - http://www.microsoft.com/whdc/devtools/ddk/default.mspx for low price) diff --git a/contrib/masmx86/bld_ml32.bat b/contrib/masmx86/bld_ml32.bat deleted file mode 100644 index e1b86bf..0000000 --- a/contrib/masmx86/bld_ml32.bat +++ /dev/null @@ -1,2 +0,0 @@ -ml /coff /Zi /c /Flmatch686.lst match686.asm -ml /coff /Zi /c /Flinffas32.lst inffas32.asm diff --git a/contrib/masmx86/inffas32.asm b/contrib/masmx86/inffas32.asm deleted file mode 100644 index 03d20f8..0000000 --- a/contrib/masmx86/inffas32.asm +++ /dev/null @@ -1,1080 +0,0 @@ -;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding -; * -; * inffas32.asm is derivated from inffas86.c, with translation of assembly code -; * -; * Copyright (C) 1995-2003 Mark Adler -; * For conditions of distribution and use, see copyright notice in zlib.h -; * -; * Copyright (C) 2003 Chris Anderson -; * Please use the copyright conditions above. -; * -; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from -; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at -; * the moment. I have successfully compiled and tested this code with gcc2.96, -; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S -; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX -; * enabled. I will attempt to merge the MMX code into this version. Newer -; * versions of this and inffast.S can be found at -; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ -; * -; * 2005 : modification by Gilles Vollant -; */ -; For Visual C++ 4.x and higher and ML 6.x and higher -; ml.exe is in directory \MASM611C of Win95 DDK -; ml.exe is also distributed in http://www.masm32.com/masmdl.htm -; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ -; -; -; compile with command line option -; ml /coff /Zi /c /Flinffas32.lst inffas32.asm - -; if you define NO_GZIP (see inflate.h), compile with -; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm - - -; zlib122sup is 0 fort zlib 1.2.2.1 and lower -; zlib122sup is 8 fort zlib 1.2.2.2 and more (with addition of dmax and head -; in inflate_state in inflate.h) -zlib1222sup equ 8 - - -IFDEF GUNZIP - INFLATE_MODE_TYPE equ 11 - INFLATE_MODE_BAD equ 26 -ELSE - IFNDEF NO_GUNZIP - INFLATE_MODE_TYPE equ 11 - INFLATE_MODE_BAD equ 26 - ELSE - INFLATE_MODE_TYPE equ 3 - INFLATE_MODE_BAD equ 17 - ENDIF -ENDIF - - -; 75 "inffast.S" -;FILE "inffast.S" - -;;;GLOBAL _inflate_fast - -;;;SECTION .text - - - - .586p - .mmx - - name inflate_fast_x86 - .MODEL FLAT - -_DATA segment -inflate_fast_use_mmx: - dd 1 - - -_TEXT segment - - - -ALIGN 4 - db 'Fast decoding Code from Chris Anderson' - db 0 - -ALIGN 4 -invalid_literal_length_code_msg: - db 'invalid literal/length code' - db 0 - -ALIGN 4 -invalid_distance_code_msg: - db 'invalid distance code' - db 0 - -ALIGN 4 -invalid_distance_too_far_msg: - db 'invalid distance too far back' - db 0 - - -ALIGN 4 -inflate_fast_mask: -dd 0 -dd 1 -dd 3 -dd 7 -dd 15 -dd 31 -dd 63 -dd 127 -dd 255 -dd 511 -dd 1023 -dd 2047 -dd 4095 -dd 8191 -dd 16383 -dd 32767 -dd 65535 -dd 131071 -dd 262143 -dd 524287 -dd 1048575 -dd 2097151 -dd 4194303 -dd 8388607 -dd 16777215 -dd 33554431 -dd 67108863 -dd 134217727 -dd 268435455 -dd 536870911 -dd 1073741823 -dd 2147483647 -dd 4294967295 - - -mode_state equ 0 ;/* state->mode */ -wsize_state equ (32+zlib1222sup) ;/* state->wsize */ -write_state equ (36+4+zlib1222sup) ;/* state->write */ -window_state equ (40+4+zlib1222sup) ;/* state->window */ -hold_state equ (44+4+zlib1222sup) ;/* state->hold */ -bits_state equ (48+4+zlib1222sup) ;/* state->bits */ -lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */ -distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */ -lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */ -distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */ - - -;;SECTION .text -; 205 "inffast.S" -;GLOBAL inflate_fast_use_mmx - -;SECTION .data - - -; GLOBAL inflate_fast_use_mmx:object -;.size inflate_fast_use_mmx, 4 -; 226 "inffast.S" -;SECTION .text - -ALIGN 4 -_inflate_fast proc near -.FPO (16, 4, 0, 0, 1, 0) - push edi - push esi - push ebp - push ebx - pushfd - sub esp,64 - cld - - - - - mov esi, [esp+88] - mov edi, [esi+28] - - - - - - - - mov edx, [esi+4] - mov eax, [esi+0] - - add edx,eax - sub edx,11 - - mov [esp+44],eax - mov [esp+20],edx - - mov ebp, [esp+92] - mov ecx, [esi+16] - mov ebx, [esi+12] - - sub ebp,ecx - neg ebp - add ebp,ebx - - sub ecx,257 - add ecx,ebx - - mov [esp+60],ebx - mov [esp+40],ebp - mov [esp+16],ecx -; 285 "inffast.S" - mov eax, [edi+lencode_state] - mov ecx, [edi+distcode_state] - - mov [esp+8],eax - mov [esp+12],ecx - - mov eax,1 - mov ecx, [edi+lenbits_state] - shl eax,cl - dec eax - mov [esp+0],eax - - mov eax,1 - mov ecx, [edi+distbits_state] - shl eax,cl - dec eax - mov [esp+4],eax - - mov eax, [edi+wsize_state] - mov ecx, [edi+write_state] - mov edx, [edi+window_state] - - mov [esp+52],eax - mov [esp+48],ecx - mov [esp+56],edx - - mov ebp, [edi+hold_state] - mov ebx, [edi+bits_state] -; 321 "inffast.S" - mov esi, [esp+44] - mov ecx, [esp+20] - cmp ecx,esi - ja L_align_long - - add ecx,11 - sub ecx,esi - mov eax,12 - sub eax,ecx - lea edi, [esp+28] - rep movsb - mov ecx,eax - xor eax,eax - rep stosb - lea esi, [esp+28] - mov [esp+20],esi - jmp L_is_aligned - - -L_align_long: - test esi,3 - jz L_is_aligned - xor eax,eax - mov al, [esi] - inc esi - mov ecx,ebx - add ebx,8 - shl eax,cl - or ebp,eax - jmp L_align_long - -L_is_aligned: - mov edi, [esp+60] -; 366 "inffast.S" -L_check_mmx: - cmp dword ptr [inflate_fast_use_mmx],2 - je L_init_mmx - ja L_do_loop - - push eax - push ebx - push ecx - push edx - pushfd - mov eax, [esp] - xor dword ptr [esp],0200000h - - - - - popfd - pushfd - pop edx - xor edx,eax - jz L_dont_use_mmx - xor eax,eax - cpuid - cmp ebx,0756e6547h - jne L_dont_use_mmx - cmp ecx,06c65746eh - jne L_dont_use_mmx - cmp edx,049656e69h - jne L_dont_use_mmx - mov eax,1 - cpuid - shr eax,8 - and eax,15 - cmp eax,6 - jne L_dont_use_mmx - test edx,0800000h - jnz L_use_mmx - jmp L_dont_use_mmx -L_use_mmx: - mov dword ptr [inflate_fast_use_mmx],2 - jmp L_check_mmx_pop -L_dont_use_mmx: - mov dword ptr [inflate_fast_use_mmx],3 -L_check_mmx_pop: - pop edx - pop ecx - pop ebx - pop eax - jmp L_check_mmx -; 426 "inffast.S" -ALIGN 4 -L_do_loop: -; 437 "inffast.S" - cmp bl,15 - ja L_get_length_code - - xor eax,eax - lodsw - mov cl,bl - add bl,16 - shl eax,cl - or ebp,eax - -L_get_length_code: - mov edx, [esp+0] - mov ecx, [esp+8] - and edx,ebp - mov eax, [ecx+edx*4] - -L_dolen: - - - - - - - mov cl,ah - sub bl,ah - shr ebp,cl - - - - - - - test al,al - jnz L_test_for_length_base - - shr eax,16 - stosb - -L_while_test: - - - cmp [esp+16],edi - jbe L_break_loop - - cmp [esp+20],esi - ja L_do_loop - jmp L_break_loop - -L_test_for_length_base: -; 502 "inffast.S" - mov edx,eax - shr edx,16 - mov cl,al - - test al,16 - jz L_test_for_second_level_length - and cl,15 - jz L_save_len - cmp bl,cl - jae L_add_bits_to_len - - mov ch,cl - xor eax,eax - lodsw - mov cl,bl - add bl,16 - shl eax,cl - or ebp,eax - mov cl,ch - -L_add_bits_to_len: - mov eax,1 - shl eax,cl - dec eax - sub bl,cl - and eax,ebp - shr ebp,cl - add edx,eax - -L_save_len: - mov [esp+24],edx - - -L_decode_distance: -; 549 "inffast.S" - cmp bl,15 - ja L_get_distance_code - - xor eax,eax - lodsw - mov cl,bl - add bl,16 - shl eax,cl - or ebp,eax - -L_get_distance_code: - mov edx, [esp+4] - mov ecx, [esp+12] - and edx,ebp - mov eax, [ecx+edx*4] - - -L_dodist: - mov edx,eax - shr edx,16 - mov cl,ah - sub bl,ah - shr ebp,cl -; 584 "inffast.S" - mov cl,al - - test al,16 - jz L_test_for_second_level_dist - and cl,15 - jz L_check_dist_one - cmp bl,cl - jae L_add_bits_to_dist - - mov ch,cl - xor eax,eax - lodsw - mov cl,bl - add bl,16 - shl eax,cl - or ebp,eax - mov cl,ch - -L_add_bits_to_dist: - mov eax,1 - shl eax,cl - dec eax - sub bl,cl - and eax,ebp - shr ebp,cl - add edx,eax - jmp L_check_window - -L_check_window: -; 625 "inffast.S" - mov [esp+44],esi - mov eax,edi - sub eax, [esp+40] - - cmp eax,edx - jb L_clip_window - - mov ecx, [esp+24] - mov esi,edi - sub esi,edx - - sub ecx,3 - mov al, [esi] - mov [edi],al - mov al, [esi+1] - mov dl, [esi+2] - add esi,3 - mov [edi+1],al - mov [edi+2],dl - add edi,3 - rep movsb - - mov esi, [esp+44] - jmp L_while_test - -ALIGN 4 -L_check_dist_one: - cmp edx,1 - jne L_check_window - cmp [esp+40],edi - je L_check_window - - dec edi - mov ecx, [esp+24] - mov al, [edi] - sub ecx,3 - - mov [edi+1],al - mov [edi+2],al - mov [edi+3],al - add edi,4 - rep stosb - - jmp L_while_test - -ALIGN 4 -L_test_for_second_level_length: - - - - - test al,64 - jnz L_test_for_end_of_block - - mov eax,1 - shl eax,cl - dec eax - and eax,ebp - add eax,edx - mov edx, [esp+8] - mov eax, [edx+eax*4] - jmp L_dolen - -ALIGN 4 -L_test_for_second_level_dist: - - - - - test al,64 - jnz L_invalid_distance_code - - mov eax,1 - shl eax,cl - dec eax - and eax,ebp - add eax,edx - mov edx, [esp+12] - mov eax, [edx+eax*4] - jmp L_dodist - -ALIGN 4 -L_clip_window: -; 721 "inffast.S" - mov ecx,eax - mov eax, [esp+52] - neg ecx - mov esi, [esp+56] - - cmp eax,edx - jb L_invalid_distance_too_far - - add ecx,edx - cmp dword ptr [esp+48],0 - jne L_wrap_around_window - - sub eax,ecx - add esi,eax -; 749 "inffast.S" - mov eax, [esp+24] - cmp eax,ecx - jbe L_do_copy1 - - sub eax,ecx - rep movsb - mov esi,edi - sub esi,edx - jmp L_do_copy1 - - cmp eax,ecx - jbe L_do_copy1 - - sub eax,ecx - rep movsb - mov esi,edi - sub esi,edx - jmp L_do_copy1 - -L_wrap_around_window: -; 793 "inffast.S" - mov eax, [esp+48] - cmp ecx,eax - jbe L_contiguous_in_window - - add esi, [esp+52] - add esi,eax - sub esi,ecx - sub ecx,eax - - - mov eax, [esp+24] - cmp eax,ecx - jbe L_do_copy1 - - sub eax,ecx - rep movsb - mov esi, [esp+56] - mov ecx, [esp+48] - cmp eax,ecx - jbe L_do_copy1 - - sub eax,ecx - rep movsb - mov esi,edi - sub esi,edx - jmp L_do_copy1 - -L_contiguous_in_window: -; 836 "inffast.S" - add esi,eax - sub esi,ecx - - - mov eax, [esp+24] - cmp eax,ecx - jbe L_do_copy1 - - sub eax,ecx - rep movsb - mov esi,edi - sub esi,edx - -L_do_copy1: -; 862 "inffast.S" - mov ecx,eax - rep movsb - - mov esi, [esp+44] - jmp L_while_test -; 878 "inffast.S" -ALIGN 4 -L_init_mmx: - emms - - - - - - movd mm0,ebp - mov ebp,ebx -; 896 "inffast.S" - movd mm4,dword ptr [esp+0] - movq mm3,mm4 - movd mm5,dword ptr [esp+4] - movq mm2,mm5 - pxor mm1,mm1 - mov ebx, [esp+8] - jmp L_do_loop_mmx - -ALIGN 4 -L_do_loop_mmx: - psrlq mm0,mm1 - - cmp ebp,32 - ja L_get_length_code_mmx - - movd mm6,ebp - movd mm7,dword ptr [esi] - add esi,4 - psllq mm7,mm6 - add ebp,32 - por mm0,mm7 - -L_get_length_code_mmx: - pand mm4,mm0 - movd eax,mm4 - movq mm4,mm3 - mov eax, [ebx+eax*4] - -L_dolen_mmx: - movzx ecx,ah - movd mm1,ecx - sub ebp,ecx - - test al,al - jnz L_test_for_length_base_mmx - - shr eax,16 - stosb - -L_while_test_mmx: - - - cmp [esp+16],edi - jbe L_break_loop - - cmp [esp+20],esi - ja L_do_loop_mmx - jmp L_break_loop - -L_test_for_length_base_mmx: - - mov edx,eax - shr edx,16 - - test al,16 - jz L_test_for_second_level_length_mmx - and eax,15 - jz L_decode_distance_mmx - - psrlq mm0,mm1 - movd mm1,eax - movd ecx,mm0 - sub ebp,eax - and ecx, [inflate_fast_mask+eax*4] - add edx,ecx - -L_decode_distance_mmx: - psrlq mm0,mm1 - - cmp ebp,32 - ja L_get_dist_code_mmx - - movd mm6,ebp - movd mm7,dword ptr [esi] - add esi,4 - psllq mm7,mm6 - add ebp,32 - por mm0,mm7 - -L_get_dist_code_mmx: - mov ebx, [esp+12] - pand mm5,mm0 - movd eax,mm5 - movq mm5,mm2 - mov eax, [ebx+eax*4] - -L_dodist_mmx: - - movzx ecx,ah - mov ebx,eax - shr ebx,16 - sub ebp,ecx - movd mm1,ecx - - test al,16 - jz L_test_for_second_level_dist_mmx - and eax,15 - jz L_check_dist_one_mmx - -L_add_bits_to_dist_mmx: - psrlq mm0,mm1 - movd mm1,eax - movd ecx,mm0 - sub ebp,eax - and ecx, [inflate_fast_mask+eax*4] - add ebx,ecx - -L_check_window_mmx: - mov [esp+44],esi - mov eax,edi - sub eax, [esp+40] - - cmp eax,ebx - jb L_clip_window_mmx - - mov ecx,edx - mov esi,edi - sub esi,ebx - - sub ecx,3 - mov al, [esi] - mov [edi],al - mov al, [esi+1] - mov dl, [esi+2] - add esi,3 - mov [edi+1],al - mov [edi+2],dl - add edi,3 - rep movsb - - mov esi, [esp+44] - mov ebx, [esp+8] - jmp L_while_test_mmx - -ALIGN 4 -L_check_dist_one_mmx: - cmp ebx,1 - jne L_check_window_mmx - cmp [esp+40],edi - je L_check_window_mmx - - dec edi - mov ecx,edx - mov al, [edi] - sub ecx,3 - - mov [edi+1],al - mov [edi+2],al - mov [edi+3],al - add edi,4 - rep stosb - - mov ebx, [esp+8] - jmp L_while_test_mmx - -ALIGN 4 -L_test_for_second_level_length_mmx: - test al,64 - jnz L_test_for_end_of_block - - and eax,15 - psrlq mm0,mm1 - movd ecx,mm0 - and ecx, [inflate_fast_mask+eax*4] - add ecx,edx - mov eax, [ebx+ecx*4] - jmp L_dolen_mmx - -ALIGN 4 -L_test_for_second_level_dist_mmx: - test al,64 - jnz L_invalid_distance_code - - and eax,15 - psrlq mm0,mm1 - movd ecx,mm0 - and ecx, [inflate_fast_mask+eax*4] - mov eax, [esp+12] - add ecx,ebx - mov eax, [eax+ecx*4] - jmp L_dodist_mmx - -ALIGN 4 -L_clip_window_mmx: - - mov ecx,eax - mov eax, [esp+52] - neg ecx - mov esi, [esp+56] - - cmp eax,ebx - jb L_invalid_distance_too_far - - add ecx,ebx - cmp dword ptr [esp+48],0 - jne L_wrap_around_window_mmx - - sub eax,ecx - add esi,eax - - cmp edx,ecx - jbe L_do_copy1_mmx - - sub edx,ecx - rep movsb - mov esi,edi - sub esi,ebx - jmp L_do_copy1_mmx - - cmp edx,ecx - jbe L_do_copy1_mmx - - sub edx,ecx - rep movsb - mov esi,edi - sub esi,ebx - jmp L_do_copy1_mmx - -L_wrap_around_window_mmx: - - mov eax, [esp+48] - cmp ecx,eax - jbe L_contiguous_in_window_mmx - - add esi, [esp+52] - add esi,eax - sub esi,ecx - sub ecx,eax - - - cmp edx,ecx - jbe L_do_copy1_mmx - - sub edx,ecx - rep movsb - mov esi, [esp+56] - mov ecx, [esp+48] - cmp edx,ecx - jbe L_do_copy1_mmx - - sub edx,ecx - rep movsb - mov esi,edi - sub esi,ebx - jmp L_do_copy1_mmx - -L_contiguous_in_window_mmx: - - add esi,eax - sub esi,ecx - - - cmp edx,ecx - jbe L_do_copy1_mmx - - sub edx,ecx - rep movsb - mov esi,edi - sub esi,ebx - -L_do_copy1_mmx: - - - mov ecx,edx - rep movsb - - mov esi, [esp+44] - mov ebx, [esp+8] - jmp L_while_test_mmx -; 1174 "inffast.S" -L_invalid_distance_code: - - - - - - mov ecx, invalid_distance_code_msg - mov edx,INFLATE_MODE_BAD - jmp L_update_stream_state - -L_test_for_end_of_block: - - - - - - test al,32 - jz L_invalid_literal_length_code - - mov ecx,0 - mov edx,INFLATE_MODE_TYPE - jmp L_update_stream_state - -L_invalid_literal_length_code: - - - - - - mov ecx, invalid_literal_length_code_msg - mov edx,INFLATE_MODE_BAD - jmp L_update_stream_state - -L_invalid_distance_too_far: - - - - mov esi, [esp+44] - mov ecx, invalid_distance_too_far_msg - mov edx,INFLATE_MODE_BAD - jmp L_update_stream_state - -L_update_stream_state: - - mov eax, [esp+88] - test ecx,ecx - jz L_skip_msg - mov [eax+24],ecx -L_skip_msg: - mov eax, [eax+28] - mov [eax+mode_state],edx - jmp L_break_loop - -ALIGN 4 -L_break_loop: -; 1243 "inffast.S" - cmp dword ptr [inflate_fast_use_mmx],2 - jne L_update_next_in - - - - mov ebx,ebp - -L_update_next_in: -; 1266 "inffast.S" - mov eax, [esp+88] - mov ecx,ebx - mov edx, [eax+28] - shr ecx,3 - sub esi,ecx - shl ecx,3 - sub ebx,ecx - mov [eax+12],edi - mov [edx+bits_state],ebx - mov ecx,ebx - - lea ebx, [esp+28] - cmp [esp+20],ebx - jne L_buf_not_used - - sub esi,ebx - mov ebx, [eax+0] - mov [esp+20],ebx - add esi,ebx - mov ebx, [eax+4] - sub ebx,11 - add [esp+20],ebx - -L_buf_not_used: - mov [eax+0],esi - - mov ebx,1 - shl ebx,cl - dec ebx - - - - - - cmp dword ptr [inflate_fast_use_mmx],2 - jne L_update_hold - - - - psrlq mm0,mm1 - movd ebp,mm0 - - emms - -L_update_hold: - - - - and ebp,ebx - mov [edx+hold_state],ebp - - - - - mov ebx, [esp+20] - cmp ebx,esi - jbe L_last_is_smaller - - sub ebx,esi - add ebx,11 - mov [eax+4],ebx - jmp L_fixup_out -L_last_is_smaller: - sub esi,ebx - neg esi - add esi,11 - mov [eax+4],esi - - - - -L_fixup_out: - - mov ebx, [esp+16] - cmp ebx,edi - jbe L_end_is_smaller - - sub ebx,edi - add ebx,257 - mov [eax+16],ebx - jmp L_done -L_end_is_smaller: - sub edi,ebx - neg edi - add edi,257 - mov [eax+16],edi - - - - - -L_done: - add esp,64 - popfd - pop ebx - pop ebp - pop esi - pop edi - ret -_inflate_fast endp - -_TEXT ends -end diff --git a/contrib/masmx86/match686.asm b/contrib/masmx86/match686.asm deleted file mode 100644 index 3b09212..0000000 --- a/contrib/masmx86/match686.asm +++ /dev/null @@ -1,479 +0,0 @@ -; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86 -; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant. -; File written by Gilles Vollant, by converting match686.S from Brian Raiter -; for MASM. This is as assembly version of longest_match -; from Jean-loup Gailly in deflate.c -; -; http://www.zlib.net -; http://www.winimage.com/zLibDll -; http://www.muppetlabs.com/~breadbox/software/assembly.html -; -; For Visual C++ 4.x and higher and ML 6.x and higher -; ml.exe is distributed in -; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64 -; -; this file contain two implementation of longest_match -; -; this longest_match was written by Brian raiter (1998), optimized for Pentium Pro -; (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom) -; -; for using an assembly version of longest_match, you need define ASMV in project -; -; compile the asm file running -; ml /coff /Zi /c /Flmatch686.lst match686.asm -; and do not include match686.obj in your project -; -; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for -; Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor -; with autoselect (with cpu detection code) -; if you want support the old pentium optimization, you can still use these version -; -; this file is not optimized for old pentium, but it compatible with all x86 32 bits -; processor (starting 80386) -; -; -; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2 - -;uInt longest_match(s, cur_match) -; deflate_state *s; -; IPos cur_match; /* current match */ - - NbStack equ 76 - cur_match equ dword ptr[esp+NbStack-0] - str_s equ dword ptr[esp+NbStack-4] -; 5 dword on top (ret,ebp,esi,edi,ebx) - adrret equ dword ptr[esp+NbStack-8] - pushebp equ dword ptr[esp+NbStack-12] - pushedi equ dword ptr[esp+NbStack-16] - pushesi equ dword ptr[esp+NbStack-20] - pushebx equ dword ptr[esp+NbStack-24] - - chain_length equ dword ptr [esp+NbStack-28] - limit equ dword ptr [esp+NbStack-32] - best_len equ dword ptr [esp+NbStack-36] - window equ dword ptr [esp+NbStack-40] - prev equ dword ptr [esp+NbStack-44] - scan_start equ word ptr [esp+NbStack-48] - wmask equ dword ptr [esp+NbStack-52] - match_start_ptr equ dword ptr [esp+NbStack-56] - nice_match equ dword ptr [esp+NbStack-60] - scan equ dword ptr [esp+NbStack-64] - - windowlen equ dword ptr [esp+NbStack-68] - match_start equ dword ptr [esp+NbStack-72] - strend equ dword ptr [esp+NbStack-76] - NbStackAdd equ (NbStack-24) - - .386p - - name gvmatch - .MODEL FLAT - - - -; all the +zlib1222add offsets are due to the addition of fields -; in zlib in the deflate_state structure since the asm code was first written -; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). -; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). -; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). - - zlib1222add equ 8 - -; Note : these value are good with a 8 bytes boundary pack structure - dep_chain_length equ 74h+zlib1222add - dep_window equ 30h+zlib1222add - dep_strstart equ 64h+zlib1222add - dep_prev_length equ 70h+zlib1222add - dep_nice_match equ 88h+zlib1222add - dep_w_size equ 24h+zlib1222add - dep_prev equ 38h+zlib1222add - dep_w_mask equ 2ch+zlib1222add - dep_good_match equ 84h+zlib1222add - dep_match_start equ 68h+zlib1222add - dep_lookahead equ 6ch+zlib1222add - - -_TEXT segment - -IFDEF NOUNDERLINE - public longest_match - public match_init -ELSE - public _longest_match - public _match_init -ENDIF - - MAX_MATCH equ 258 - MIN_MATCH equ 3 - MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) - - - -MAX_MATCH equ 258 -MIN_MATCH equ 3 -MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) -MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h) - - -;;; stack frame offsets - -chainlenwmask equ esp + 0 ; high word: current chain len - ; low word: s->wmask -window equ esp + 4 ; local copy of s->window -windowbestlen equ esp + 8 ; s->window + bestlen -scanstart equ esp + 16 ; first two bytes of string -scanend equ esp + 12 ; last two bytes of string -scanalign equ esp + 20 ; dword-misalignment of string -nicematch equ esp + 24 ; a good enough match size -bestlen equ esp + 28 ; size of best match so far -scan equ esp + 32 ; ptr to string wanting match - -LocalVarsSize equ 36 -; saved ebx byte esp + 36 -; saved edi byte esp + 40 -; saved esi byte esp + 44 -; saved ebp byte esp + 48 -; return address byte esp + 52 -deflatestate equ esp + 56 ; the function arguments -curmatch equ esp + 60 - -;;; Offsets for fields in the deflate_state structure. These numbers -;;; are calculated from the definition of deflate_state, with the -;;; assumption that the compiler will dword-align the fields. (Thus, -;;; changing the definition of deflate_state could easily cause this -;;; program to crash horribly, without so much as a warning at -;;; compile time. Sigh.) - -dsWSize equ 36+zlib1222add -dsWMask equ 44+zlib1222add -dsWindow equ 48+zlib1222add -dsPrev equ 56+zlib1222add -dsMatchLen equ 88+zlib1222add -dsPrevMatch equ 92+zlib1222add -dsStrStart equ 100+zlib1222add -dsMatchStart equ 104+zlib1222add -dsLookahead equ 108+zlib1222add -dsPrevLen equ 112+zlib1222add -dsMaxChainLen equ 116+zlib1222add -dsGoodMatch equ 132+zlib1222add -dsNiceMatch equ 136+zlib1222add - - -;;; match686.asm -- Pentium-Pro-optimized version of longest_match() -;;; Written for zlib 1.1.2 -;;; Copyright (C) 1998 Brian Raiter -;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html -;;; -;; -;; This software is provided 'as-is', without any express or implied -;; warranty. In no event will the authors be held liable for any damages -;; arising from the use of this software. -;; -;; Permission is granted to anyone to use this software for any purpose, -;; including commercial applications, and to alter it and redistribute it -;; freely, subject to the following restrictions: -;; -;; 1. The origin of this software must not be misrepresented; you must not -;; claim that you wrote the original software. If you use this software -;; in a product, an acknowledgment in the product documentation would be -;; appreciated but is not required. -;; 2. Altered source versions must be plainly marked as such, and must not be -;; misrepresented as being the original software -;; 3. This notice may not be removed or altered from any source distribution. -;; - -;GLOBAL _longest_match, _match_init - - -;SECTION .text - -;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) - -;_longest_match: - IFDEF NOUNDERLINE - longest_match proc near - ELSE - _longest_match proc near - ENDIF -.FPO (9, 4, 0, 0, 1, 0) - -;;; Save registers that the compiler may be using, and adjust esp to -;;; make room for our stack frame. - - push ebp - push edi - push esi - push ebx - sub esp, LocalVarsSize - -;;; Retrieve the function arguments. ecx will hold cur_match -;;; throughout the entire function. edx will hold the pointer to the -;;; deflate_state structure during the function's setup (before -;;; entering the main loop. - - mov edx, [deflatestate] - mov ecx, [curmatch] - -;;; uInt wmask = s->w_mask; -;;; unsigned chain_length = s->max_chain_length; -;;; if (s->prev_length >= s->good_match) { -;;; chain_length >>= 2; -;;; } - - mov eax, [edx + dsPrevLen] - mov ebx, [edx + dsGoodMatch] - cmp eax, ebx - mov eax, [edx + dsWMask] - mov ebx, [edx + dsMaxChainLen] - jl LastMatchGood - shr ebx, 2 -LastMatchGood: - -;;; chainlen is decremented once beforehand so that the function can -;;; use the sign flag instead of the zero flag for the exit test. -;;; It is then shifted into the high word, to make room for the wmask -;;; value, which it will always accompany. - - dec ebx - shl ebx, 16 - or ebx, eax - mov [chainlenwmask], ebx - -;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; - - mov eax, [edx + dsNiceMatch] - mov ebx, [edx + dsLookahead] - cmp ebx, eax - jl LookaheadLess - mov ebx, eax -LookaheadLess: mov [nicematch], ebx - -;;; register Bytef *scan = s->window + s->strstart; - - mov esi, [edx + dsWindow] - mov [window], esi - mov ebp, [edx + dsStrStart] - lea edi, [esi + ebp] - mov [scan], edi - -;;; Determine how many bytes the scan ptr is off from being -;;; dword-aligned. - - mov eax, edi - neg eax - and eax, 3 - mov [scanalign], eax - -;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? -;;; s->strstart - (IPos)MAX_DIST(s) : NIL; - - mov eax, [edx + dsWSize] - sub eax, MIN_LOOKAHEAD - sub ebp, eax - jg LimitPositive - xor ebp, ebp -LimitPositive: - -;;; int best_len = s->prev_length; - - mov eax, [edx + dsPrevLen] - mov [bestlen], eax - -;;; Store the sum of s->window + best_len in esi locally, and in esi. - - add esi, eax - mov [windowbestlen], esi - -;;; register ush scan_start = *(ushf*)scan; -;;; register ush scan_end = *(ushf*)(scan+best_len-1); -;;; Posf *prev = s->prev; - - movzx ebx, word ptr [edi] - mov [scanstart], ebx - movzx ebx, word ptr [edi + eax - 1] - mov [scanend], ebx - mov edi, [edx + dsPrev] - -;;; Jump into the main loop. - - mov edx, [chainlenwmask] - jmp short LoopEntry - -align 4 - -;;; do { -;;; match = s->window + cur_match; -;;; if (*(ushf*)(match+best_len-1) != scan_end || -;;; *(ushf*)match != scan_start) continue; -;;; [...] -;;; } while ((cur_match = prev[cur_match & wmask]) > limit -;;; && --chain_length != 0); -;;; -;;; Here is the inner loop of the function. The function will spend the -;;; majority of its time in this loop, and majority of that time will -;;; be spent in the first ten instructions. -;;; -;;; Within this loop: -;;; ebx = scanend -;;; ecx = curmatch -;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) -;;; esi = windowbestlen - i.e., (window + bestlen) -;;; edi = prev -;;; ebp = limit - -LookupLoop: - and ecx, edx - movzx ecx, word ptr [edi + ecx*2] - cmp ecx, ebp - jbe LeaveNow - sub edx, 00010000h - js LeaveNow -LoopEntry: movzx eax, word ptr [esi + ecx - 1] - cmp eax, ebx - jnz LookupLoop - mov eax, [window] - movzx eax, word ptr [eax + ecx] - cmp eax, [scanstart] - jnz LookupLoop - -;;; Store the current value of chainlen. - - mov [chainlenwmask], edx - -;;; Point edi to the string under scrutiny, and esi to the string we -;;; are hoping to match it up with. In actuality, esi and edi are -;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is -;;; initialized to -(MAX_MATCH_8 - scanalign). - - mov esi, [window] - mov edi, [scan] - add esi, ecx - mov eax, [scanalign] - mov edx, 0fffffef8h; -(MAX_MATCH_8) - lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] - lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] - -;;; Test the strings for equality, 8 bytes at a time. At the end, -;;; adjust edx so that it is offset to the exact byte that mismatched. -;;; -;;; We already know at this point that the first three bytes of the -;;; strings match each other, and they can be safely passed over before -;;; starting the compare loop. So what this code does is skip over 0-3 -;;; bytes, as much as necessary in order to dword-align the edi -;;; pointer. (esi will still be misaligned three times out of four.) -;;; -;;; It should be confessed that this loop usually does not represent -;;; much of the total running time. Replacing it with a more -;;; straightforward "rep cmpsb" would not drastically degrade -;;; performance. - -LoopCmps: - mov eax, [esi + edx] - xor eax, [edi + edx] - jnz LeaveLoopCmps - mov eax, [esi + edx + 4] - xor eax, [edi + edx + 4] - jnz LeaveLoopCmps4 - add edx, 8 - jnz LoopCmps - jmp short LenMaximum -LeaveLoopCmps4: add edx, 4 -LeaveLoopCmps: test eax, 0000FFFFh - jnz LenLower - add edx, 2 - shr eax, 16 -LenLower: sub al, 1 - adc edx, 0 - -;;; Calculate the length of the match. If it is longer than MAX_MATCH, -;;; then automatically accept it as the best possible match and leave. - - lea eax, [edi + edx] - mov edi, [scan] - sub eax, edi - cmp eax, MAX_MATCH - jge LenMaximum - -;;; If the length of the match is not longer than the best match we -;;; have so far, then forget it and return to the lookup loop. - - mov edx, [deflatestate] - mov ebx, [bestlen] - cmp eax, ebx - jg LongerMatch - mov esi, [windowbestlen] - mov edi, [edx + dsPrev] - mov ebx, [scanend] - mov edx, [chainlenwmask] - jmp LookupLoop - -;;; s->match_start = cur_match; -;;; best_len = len; -;;; if (len >= nice_match) break; -;;; scan_end = *(ushf*)(scan+best_len-1); - -LongerMatch: mov ebx, [nicematch] - mov [bestlen], eax - mov [edx + dsMatchStart], ecx - cmp eax, ebx - jge LeaveNow - mov esi, [window] - add esi, eax - mov [windowbestlen], esi - movzx ebx, word ptr [edi + eax - 1] - mov edi, [edx + dsPrev] - mov [scanend], ebx - mov edx, [chainlenwmask] - jmp LookupLoop - -;;; Accept the current string, with the maximum possible length. - -LenMaximum: mov edx, [deflatestate] - mov dword ptr [bestlen], MAX_MATCH - mov [edx + dsMatchStart], ecx - -;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; -;;; return s->lookahead; - -LeaveNow: - mov edx, [deflatestate] - mov ebx, [bestlen] - mov eax, [edx + dsLookahead] - cmp ebx, eax - jg LookaheadRet - mov eax, ebx -LookaheadRet: - -;;; Restore the stack and return from whence we came. - - add esp, LocalVarsSize - pop ebx - pop esi - pop edi - pop ebp - - ret -; please don't remove this string ! -; Your can freely use match686 in any free or commercial app if you don't remove the string in the binary! - db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah - - - IFDEF NOUNDERLINE - longest_match endp - ELSE - _longest_match endp - ENDIF - - IFDEF NOUNDERLINE - match_init proc near - ret - match_init endp - ELSE - _match_init proc near - ret - _match_init endp - ENDIF - - -_TEXT ends -end diff --git a/contrib/masmx86/readme.txt b/contrib/masmx86/readme.txt deleted file mode 100644 index 3271f72..0000000 --- a/contrib/masmx86/readme.txt +++ /dev/null @@ -1,27 +0,0 @@ - -Summary -------- -This directory contains ASM implementations of the functions -longest_match() and inflate_fast(). - - -Use instructions ----------------- -Assemble using MASM, and copy the object files into the zlib source -directory, then run the appropriate makefile, as suggested below. You can -donwload MASM from here: - - http://www.microsoft.com/downloads/details.aspx?displaylang=en&FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64 - -You can also get objects files here: - - http://www.winimage.com/zLibDll/zlib124_masm_obj.zip - -Build instructions ------------------- -* With Microsoft C and MASM: -nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" - -* With Borland C and TASM: -make -f win32/Makefile.bor LOCAL_ZLIB="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" OBJPA="+match686c.obj+match686.obj+inffas32.obj" - diff --git a/win32/Makefile.bor b/win32/Makefile.bor index d152bbb..4495353 100644 --- a/win32/Makefile.bor +++ b/win32/Makefile.bor @@ -3,7 +3,6 @@ # # Usage: # make -f win32/Makefile.bor -# make -f win32/Makefile.bor LOCAL_ZLIB=-DASMV OBJA=match.obj OBJPA=+match.obj # ------------ Borland C++ ------------ diff --git a/win32/Makefile.gcc b/win32/Makefile.gcc index 305be50..081e391 100644 --- a/win32/Makefile.gcc +++ b/win32/Makefile.gcc @@ -11,10 +11,6 @@ # # make -fwin32/Makefile.gcc; make test testdll -fwin32/Makefile.gcc # -# To use the asm code, type: -# cp contrib/asm?86/match.S ./match.S -# make LOC=-DASMV OBJA=match.o -fwin32/Makefile.gcc -# # To install libz.a, zconf.h and zlib.h in the system directories, type: # # make install -fwin32/Makefile.gcc @@ -38,7 +34,6 @@ IMPLIB = libz.dll.a # SHARED_MODE=0 -#LOC = -DASMV #LOC = -DZLIB_DEBUG -g PREFIX = diff --git a/win32/Makefile.msc b/win32/Makefile.msc index 6831882..9c65153 100644 --- a/win32/Makefile.msc +++ b/win32/Makefile.msc @@ -4,10 +4,6 @@ # Usage: # nmake -f win32/Makefile.msc (standard build) # nmake -f win32/Makefile.msc LOC=-DFOO (nonstandard build) -# nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" \ -# OBJA="inffas32.obj match686.obj" (use ASM code, x86) -# nmake -f win32/Makefile.msc AS=ml64 LOC="-DASMV -DASMINF -I." \ -# OBJA="inffasx64.obj gvmat64.obj inffas8664.obj" (use ASM code, x64) # The toplevel directory of the source tree. # -- 2.44.0