]> git.lizzy.rs Git - plan9front.git/commitdiff
libc and ape support for amd64
authorcinap_lenrek <cinap_lenrek@felloff.net>
Sat, 1 Feb 2014 09:31:41 +0000 (10:31 +0100)
committercinap_lenrek <cinap_lenrek@felloff.net>
Sat, 1 Feb 2014 09:31:41 +0000 (10:31 +0100)
51 files changed:
amd64/include/ape/float.h [new file with mode: 0644]
amd64/include/ape/inttypes.h [new file with mode: 0644]
amd64/include/ape/math.h [new file with mode: 0644]
amd64/include/ape/stdarg.h [new file with mode: 0644]
amd64/include/ape/ureg.h [new file with mode: 0644]
sys/src/ape/lib/9/amd64/getcallerpc.s [new file with mode: 0644]
sys/src/ape/lib/9/amd64/getfcr.s [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/_seek.c [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/cycles.s [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/lock.c [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/main9.s [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/main9p.s [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/mkfile [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/notetramp.c [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/setjmp.s [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/strchr.s [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/strlen.s [new file with mode: 0644]
sys/src/ape/lib/ap/amd64/tas.s [new file with mode: 0644]
sys/src/libc/amd64/_seek.c [new file with mode: 0644]
sys/src/libc/amd64/argv0.s [new file with mode: 0644]
sys/src/libc/amd64/atom.s [new file with mode: 0644]
sys/src/libc/amd64/cycles.s [new file with mode: 0644]
sys/src/libc/amd64/getcallerpc.s [new file with mode: 0644]
sys/src/libc/amd64/getfcr.s [new file with mode: 0644]
sys/src/libc/amd64/main9.s [new file with mode: 0644]
sys/src/libc/amd64/main9p.s [new file with mode: 0644]
sys/src/libc/amd64/memccpy.s [new file with mode: 0644]
sys/src/libc/amd64/memchr.s [new file with mode: 0644]
sys/src/libc/amd64/memcmp.s [new file with mode: 0644]
sys/src/libc/amd64/memcpy.s [new file with mode: 0644]
sys/src/libc/amd64/memmove.s [new file with mode: 0644]
sys/src/libc/amd64/memset.s [new file with mode: 0644]
sys/src/libc/amd64/mkfile [new file with mode: 0644]
sys/src/libc/amd64/muldiv.s [new file with mode: 0644]
sys/src/libc/amd64/notejmp.c [new file with mode: 0644]
sys/src/libc/amd64/setjmp.s [new file with mode: 0644]
sys/src/libc/amd64/sqrt.s [new file with mode: 0644]
sys/src/libc/amd64/strcat.s [new file with mode: 0644]
sys/src/libc/amd64/strchr.s [new file with mode: 0644]
sys/src/libc/amd64/strcpy.s [new file with mode: 0644]
sys/src/libc/amd64/strlen.s [new file with mode: 0644]
sys/src/libc/amd64/tas.s [new file with mode: 0644]
sys/src/libmp/amd64/mkfile [new file with mode: 0644]
sys/src/libmp/amd64/mpdigdiv.s [new file with mode: 0644]
sys/src/libmp/amd64/mpvecadd.s [new file with mode: 0644]
sys/src/libmp/amd64/mpvecdigmuladd.s [new file with mode: 0644]
sys/src/libmp/amd64/mpvecdigmulsub.s [new file with mode: 0644]
sys/src/libmp/amd64/mpvecsub.s [new file with mode: 0644]
sys/src/libsec/amd64/md5block.s [new file with mode: 0644]
sys/src/libsec/amd64/mkfile [new file with mode: 0644]
sys/src/libsec/amd64/sha1block.s [new file with mode: 0644]

diff --git a/amd64/include/ape/float.h b/amd64/include/ape/float.h
new file mode 100644 (file)
index 0000000..4982868
--- /dev/null
@@ -0,0 +1,80 @@
+#ifndef __FLOAT
+#define __FLOAT
+/* IEEE, default rounding */
+
+#define FLT_ROUNDS     1
+#define FLT_RADIX      2
+
+#define FLT_DIG                6
+#define FLT_EPSILON    1.19209290e-07
+#define FLT_MANT_DIG   24
+#define FLT_MAX                3.40282347e+38
+#define FLT_MAX_10_EXP 38
+#define FLT_MAX_EXP    128
+#define FLT_MIN                1.17549435e-38
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP    -125
+
+#define DBL_DIG                15
+#define DBL_EPSILON    2.2204460492503131e-16
+#define DBL_MANT_DIG   53
+#define DBL_MAX                1.797693134862315708145e+308
+#define DBL_MAX_10_EXP 308
+#define DBL_MAX_EXP    1024
+#define DBL_MIN                2.225073858507201383090233e-308
+#define DBL_MIN_10_EXP -307
+#define DBL_MIN_EXP    -1021
+#define LDBL_MANT_DIG  DBL_MANT_DIG
+#define LDBL_EPSILON   DBL_EPSILON
+#define LDBL_DIG       DBL_DIG
+#define LDBL_MIN_EXP   DBL_MIN_EXP
+#define LDBL_MIN       DBL_MIN
+#define LDBL_MIN_10_EXP        DBL_MIN_10_EXP
+#define LDBL_MAX_EXP   DBL_MAX_EXP
+#define LDBL_MAX       DBL_MAX
+#define LDBL_MAX_10_EXP        DBL_MAX_10_EXP
+
+typedef        union FPdbleword FPdbleword;
+union FPdbleword
+{
+       double  x;
+       struct {        /* little endian */
+               long lo;
+               long hi;
+       };
+};
+
+#ifdef _RESEARCH_SOURCE
+/* define stuff needed for floating conversion */
+#define IEEE_8087      1
+#define Sudden_Underflow 1
+#endif
+#ifdef _PLAN9_SOURCE
+/* MXCSR */
+/* fcr */
+#define        FPFTZ   (1<<15) /* amd64 */
+#define        FPINEX  (1<<12)
+#define        FPUNFL  (1<<11)
+#define        FPOVFL  (1<<10)
+#define        FPZDIV  (1<<9)
+#define        FPDNRM  (1<<8)  /* amd64 */
+#define        FPINVAL (1<<7)
+#define        FPDAZ   (1<<6)  /* amd64 */
+#define        FPRNR   (0<<13)
+#define        FPRZ    (3<<13)
+#define        FPRPINF (2<<13)
+#define        FPRNINF (1<<13)
+#define        FPRMASK (3<<13)
+#define        FPPEXT  0
+#define        FPPSGL  0
+#define        FPPDBL  0
+#define        FPPMASK 0
+/* fsr */
+#define        FPAINEX (1<<5)
+#define        FPAUNFL (1<<4)
+#define        FPAOVFL (1<<3)
+#define        FPAZDIV (1<<2)
+#define        FPADNRM (1<<1)  /* not in plan 9 */
+#define        FPAINVAL        (1<<0)
+#endif
+#endif /* __FLOAT */
diff --git a/amd64/include/ape/inttypes.h b/amd64/include/ape/inttypes.h
new file mode 100644 (file)
index 0000000..0ee22cc
--- /dev/null
@@ -0,0 +1,21 @@
+#ifndef _SUSV2_SOURCE
+#error "inttypes.h is SUSV2"
+#endif
+
+#ifndef _INTTYPES_H_
+#define _INTTYPES_H_ 1
+
+
+typedef char int8_t;
+typedef short int16_t;
+typedef int int32_t;
+typedef long long int64_t;
+typedef unsigned char uint8_t;
+typedef unsigned short uint16_t;
+typedef unsigned int uint32_t;
+typedef unsigned long long uint64_t;
+
+typedef long long intptr_t;
+typedef unsigned long long uintptr_t;
+
+#endif
diff --git a/amd64/include/ape/math.h b/amd64/include/ape/math.h
new file mode 100644 (file)
index 0000000..a880a01
--- /dev/null
@@ -0,0 +1,78 @@
+#ifndef __MATH
+#define __MATH
+#pragma lib "/$M/lib/ape/libap.a"
+
+/* a HUGE_VAL appropriate for IEEE double-precision */
+/* the correct value, 1.797693134862316e+308, causes a ken overflow */
+#define HUGE_VAL 1.79769313486231e+308
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern double acos(double);
+extern double asin(double);
+extern double atan(double);
+extern double atan2(double, double);
+extern double cos(double);
+extern double hypot(double, double);
+extern double sin(double);
+extern double tan(double);
+extern double cosh(double);
+extern double sinh(double);
+extern double tanh(double);
+extern double exp(double);
+extern double frexp(double, int *);
+extern double ldexp(double, int);
+extern double log(double);
+extern double log10(double);
+extern double modf(double, double *);
+extern double pow(double, double);
+extern double sqrt(double);
+extern double ceil(double);
+extern double fabs(double);
+extern double floor(double);
+extern double fmod(double, double);
+extern double NaN(void);
+extern int isNaN(double);
+extern double Inf(int);
+extern int isInf(double, int);
+
+#ifdef _RESEARCH_SOURCE
+/* does >> treat left operand as unsigned ? */
+#define Unsigned_Shifts 1
+#define        M_E             2.7182818284590452354   /* e */
+#define        M_LOG2E         1.4426950408889634074   /* log 2e */
+#define        M_LOG10E        0.43429448190325182765  /* log 10e */
+#define        M_LN2           0.69314718055994530942  /* log e2 */
+#define        M_LN10          2.30258509299404568402  /* log e10 */
+#define        M_PI            3.14159265358979323846  /* pi */
+#define        M_PI_2          1.57079632679489661923  /* pi/2 */
+#define        M_PI_4          0.78539816339744830962  /* pi/4 */
+#define        M_1_PI          0.31830988618379067154  /* 1/pi */
+#define        M_2_PI          0.63661977236758134308  /* 2/pi */
+#define        M_2_SQRTPI      1.12837916709551257390  /* 2/sqrt(pi) */
+#define        M_SQRT2         1.41421356237309504880  /* sqrt(2) */
+#define        M_SQRT1_2       0.70710678118654752440  /* 1/sqrt(2) */
+
+extern double hypot(double, double);
+extern double erf(double);
+extern double erfc(double);
+extern double j0(double);
+extern double y0(double);
+extern double j1(double);
+extern double y1(double);
+extern double jn(int, double);
+extern double yn(int, double);
+
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#define isnan(x) isNaN(x)
+#define isinf(x) isInf(x, 0)
+
+#endif /* __MATH */
diff --git a/amd64/include/ape/stdarg.h b/amd64/include/ape/stdarg.h
new file mode 100644 (file)
index 0000000..cf46684
--- /dev/null
@@ -0,0 +1,18 @@
+#ifndef __STDARG
+#define __STDARG
+
+typedef char *va_list;
+
+#define va_start(list, start) list = (sizeof(start)<8 ? (char *)((long long *)&(start)+1) : \
+(char *)(&(start)+1))
+#define va_end(list)
+#define va_arg(list, mode)\
+       ((sizeof(mode) == 1)?\
+               ((mode*)(list += 8))[-8]:\
+       (sizeof(mode) == 2)?\
+               ((mode*)(list += 8))[-4]:\
+       (sizeof(mode) == 4)?\
+               ((mode*)(list += 8))[-2]:\
+               ((mode*)(list += sizeof(mode)))[-1])
+
+#endif /* __STDARG */
diff --git a/amd64/include/ape/ureg.h b/amd64/include/ape/ureg.h
new file mode 100644 (file)
index 0000000..f6b2aba
--- /dev/null
@@ -0,0 +1,38 @@
+#ifndef __UREG_H
+#define __UREG_H
+#if !defined(_PLAN9_SOURCE)
+    This header file is an extension to ANSI/POSIX
+#endif
+
+struct Ureg {
+       unsigned long long      ax;
+       unsigned long long      bx;
+       unsigned long long      cx;
+       unsigned long long      dx;
+       unsigned long long      si;
+       unsigned long long      di;
+       unsigned long long      bp;
+       unsigned long long      r8;
+       unsigned long long      r9;
+       unsigned long long      r10;
+       unsigned long long      r11;
+       unsigned long long      r12;
+       unsigned long long      r13;
+       unsigned long long      r14;
+       unsigned long long      r15;
+
+       unsigned short          ds;
+       unsigned short          es;
+       unsigned short          fs;
+       unsigned short          gs;
+
+       unsigned long long      type;
+       unsigned long long      error;          /* error code (or zero) */
+       unsigned long long      pc;             /* pc */
+       unsigned long long      cs;             /* old context */
+       unsigned long long      flags;          /* old flags */
+       unsigned long long      sp;             /* sp */
+       unsigned long long      ss;             /* old stack segment */
+};
+
+#endif
diff --git a/sys/src/ape/lib/9/amd64/getcallerpc.s b/sys/src/ape/lib/9/amd64/getcallerpc.s
new file mode 100644 (file)
index 0000000..15e55ea
--- /dev/null
@@ -0,0 +1,3 @@
+TEXT getcallerpc(SB), 1, $0
+       MOVQ    -8(RARG), AX
+       RET
diff --git a/sys/src/ape/lib/9/amd64/getfcr.s b/sys/src/ape/lib/9/amd64/getfcr.s
new file mode 100644 (file)
index 0000000..9fc2002
--- /dev/null
@@ -0,0 +1,38 @@
+
+TEXT   setfcr(SB), $4
+       XORL    $(0x3F<<7),RARG /* bits are cleared in csr to enable them */
+       ANDL    $0xFFC0, RARG   /* just the fcr bits */
+       WAIT    /* is this needed? */
+       STMXCSR 0(SP)
+       MOVL    0(SP), AX
+       ANDL    $~0x3F, AX
+       ORL     RARG, AX
+       MOVL    AX, 0(SP)
+       LDMXCSR 0(SP)
+       RET
+
+TEXT   getfcr(SB), $4
+       WAIT
+       STMXCSR 0(SP)
+       MOVWLZX 0(SP), AX
+       ANDL    $0xFFC0, AX
+       XORL    $(0x3F<<7),AX
+       RET
+
+TEXT   getfsr(SB), $4
+       WAIT
+       STMXCSR 0(SP)
+       MOVL    0(SP), AX
+       ANDL    $0x3F, AX
+       RET
+
+TEXT   setfsr(SB), $4
+       ANDL    $0x3F, RARG
+       WAIT
+       STMXCSR 0(SP)
+       MOVL    0(SP), AX
+       ANDL    $~0x3F, AX
+       ORL     RARG, AX
+       MOVL    AX, 0(SP)
+       LDMXCSR 0(SP)
+       RET
diff --git a/sys/src/ape/lib/ap/amd64/_seek.c b/sys/src/ape/lib/ap/amd64/_seek.c
new file mode 100644 (file)
index 0000000..a9d92c7
--- /dev/null
@@ -0,0 +1,11 @@
+extern long __SEEK(long long*, int, long long, int);
+
+long long
+_SEEK(int fd, long long o, int p)
+{
+       long long l;
+
+       if(__SEEK(&l, fd, o, p) < 0)
+               l = -1;
+       return l;
+}
diff --git a/sys/src/ape/lib/ap/amd64/cycles.s b/sys/src/ape/lib/ap/amd64/cycles.s
new file mode 100644 (file)
index 0000000..769f617
--- /dev/null
@@ -0,0 +1,5 @@
+TEXT _cycles(SB),1,$0                          /* time stamp counter; cycles since power up */
+       RDTSC
+       MOVL    AX, 0(RARG)                     /* lo */
+       MOVL    DX, 4(RARG)                     /* hi */
+       RET
diff --git a/sys/src/ape/lib/ap/amd64/lock.c b/sys/src/ape/lib/ap/amd64/lock.c
new file mode 100644 (file)
index 0000000..91c0ba2
--- /dev/null
@@ -0,0 +1,26 @@
+#define _LOCK_EXTENSION
+#include "../plan9/sys9.h"
+#include <lock.h>
+
+int    tas(int*);
+
+void
+lock(Lock *lk)
+{
+       while(tas(&lk->val))
+               _SLEEP(0);
+}
+
+int
+canlock(Lock *lk)
+{
+       if(tas(&lk->val))
+               return 0;
+       return 1;
+}
+
+void
+unlock(Lock *lk)
+{
+       lk->val = 0;
+}
diff --git a/sys/src/ape/lib/ap/amd64/main9.s b/sys/src/ape/lib/ap/amd64/main9.s
new file mode 100644 (file)
index 0000000..d461dc1
--- /dev/null
@@ -0,0 +1,26 @@
+#define NPRIVATES      16
+
+GLOBL  _tos(SB), $8
+GLOBL  _errnoloc(SB), $8
+GLOBL  _privates(SB), $8
+GLOBL  _nprivates(SB), $8
+
+TEXT   _main(SB), 1, $(32+NPRIVATES*8)
+
+       /* _tos = arg */
+       MOVQ    AX, _tos(SB)
+       LEAQ    24(SP), AX
+       MOVQ    AX, _errnoloc(SB)
+       LEAQ    32(SP), AX
+       MOVQ    AX, _privates(SB)
+       MOVQ    $NPRIVATES, _nprivates(SB)
+       CALL    _envsetup(SB)
+       MOVL    inargc-8(FP), RARG
+       LEAQ    inargv+0(FP), AX
+       MOVQ    AX, 8(SP)
+       MOVQ    environ(SB), AX
+       MOVQ    AX, 16(SP)
+       CALL    main(SB)
+       MOVQ    AX, RARG
+       CALL    exit(SB)
+       RET
diff --git a/sys/src/ape/lib/ap/amd64/main9p.s b/sys/src/ape/lib/ap/amd64/main9p.s
new file mode 100644 (file)
index 0000000..dfd3450
--- /dev/null
@@ -0,0 +1,45 @@
+#define NPRIVATES      16
+
+GLOBL  _tos(SB), $8
+GLOBL  _privates(SB), $8
+GLOBL  _nprivates(SB), $8
+
+TEXT   _mainp(SB), 1, $(3*8+NPRIVATES*8)
+
+       /* _tos = arg */
+       MOVQ    AX, _tos(SB)
+       LEAQ    8(SP), AX
+       MOVQ    AX, _privates(SB)
+       MOVQ    $NPRIVATES, _nprivates(SB)
+
+       /* _profmain(); */
+       CALL    _profmain(SB)
+
+       /* _tos->prof.pp = _tos->prof.next; */
+       MOVQ    _tos+0(SB),DX
+       MOVQ    4(DX),CX
+       MOVQ    CX,(DX)
+
+       CALL    _envsetup(SB)
+
+       /* main(argc, argv, environ); */
+       MOVL    inargc-8(FP), RARG
+       LEAQ    inargv+0(FP), AX
+       MOVQ    AX, 8(SP)
+       MOVQ    environ(SB), AX
+       MOVQ    AX, 16(SP)
+       CALL    main(SB)
+
+loop:
+       MOVL    AX, RARG
+       CALL    exit(SB)
+       MOVQ    $_profin(SB), AX        /* force loading of profile */
+       MOVL    $0, AX
+       JMP     loop
+
+TEXT   _savearg(SB), 1, $0
+       RET
+
+TEXT   _callpc(SB), 1, $0
+       MOVQ    8(RARG), AX
+       RET
diff --git a/sys/src/ape/lib/ap/amd64/mkfile b/sys/src/ape/lib/ap/amd64/mkfile
new file mode 100644 (file)
index 0000000..5bb7f32
--- /dev/null
@@ -0,0 +1,20 @@
+APE=/sys/src/ape
+objtype=amd64
+<$APE/config
+LIB=/$objtype/lib/ape/libap.a
+OFILES=\
+       _seek.$O\
+       cycles.$O\
+       lock.$O\
+       main9.$O\
+       main9p.$O\
+       notetramp.$O\
+       setjmp.$O\
+       strchr.$O\
+       strlen.$O\
+       tas.$O\
+
+</sys/src/cmd/mksyslib
+
+CFLAGS=-c -D_POSIX_SOURCE -D_PLAN9_SOURCE
+
diff --git a/sys/src/ape/lib/ap/amd64/notetramp.c b/sys/src/ape/lib/ap/amd64/notetramp.c
new file mode 100644 (file)
index 0000000..c922051
--- /dev/null
@@ -0,0 +1,81 @@
+#include "../plan9/lib.h"
+#include "../plan9/sys9.h"
+#include <signal.h>
+#include <setjmp.h>
+
+/* A stack to hold pcs when signals nest */
+#define MAXSIGSTACK 20
+typedef struct Pcstack Pcstack;
+static struct Pcstack {
+       int sig;
+       void (*hdlr)(int, char*, Ureg*);
+       unsigned long long restorepc;
+       Ureg *u;
+} pcstack[MAXSIGSTACK];
+static int nstack = 0;
+
+static void notecont(Ureg*, char*);
+
+void
+_notetramp(int sig, void (*hdlr)(int, char*, Ureg*), Ureg *u)
+{
+       Pcstack *p;
+
+       if(nstack >= MAXSIGSTACK)
+               _NOTED(1);      /* nesting too deep; just do system default */
+       p = &pcstack[nstack];
+       p->restorepc = u->pc;
+       p->sig = sig;
+       p->hdlr = hdlr;
+       p->u = u;
+       nstack++;
+       u->pc = (unsigned long long) notecont;
+       _NOTED(2);      /* NSAVE: clear note but hold state */
+}
+
+static void
+notecont(Ureg *u, char *s)
+{
+       Pcstack *p;
+       void(*f)(int, char*, Ureg*);
+
+       p = &pcstack[nstack-1];
+       f = p->hdlr;
+       u->pc = p->restorepc;
+       nstack--;
+       (*f)(p->sig, s, u);
+       _NOTED(3);      /* NRSTR */
+}
+
+#define JMPBUFPC 1
+#define JMPBUFSP 0
+
+extern sigset_t        _psigblocked;
+
+typedef struct {
+       sigset_t set;
+       sigset_t blocked;
+       unsigned long long jmpbuf[2];
+} sigjmp_buf_amd64;
+
+void
+siglongjmp(sigjmp_buf j, int ret)
+{
+       struct Ureg *u;
+       sigjmp_buf_amd64 *jb;
+
+       jb = (sigjmp_buf_amd64*)j;
+
+       if(jb->set)
+               _psigblocked = jb->blocked;
+       if(nstack == 0 || pcstack[nstack-1].u->sp > jb->jmpbuf[JMPBUFSP])
+               longjmp((void*)jb->jmpbuf, ret);
+       u = pcstack[nstack-1].u;
+       nstack--;
+       u->ax = ret;
+       if(ret == 0)
+               u->ax = 1;
+       u->pc = jb->jmpbuf[JMPBUFPC];
+       u->sp = jb->jmpbuf[JMPBUFSP] + 8;
+       _NOTED(3);      /* NRSTR */
+}
diff --git a/sys/src/ape/lib/ap/amd64/setjmp.s b/sys/src/ape/lib/ap/amd64/setjmp.s
new file mode 100644 (file)
index 0000000..67320ca
--- /dev/null
@@ -0,0 +1,27 @@
+TEXT   longjmp(SB), $0
+       MOVL    r+8(FP), AX
+       CMPL    AX, $0
+       JNE     ok              /* ansi: "longjmp(0) => longjmp(1)" */
+       MOVL    $1, AX          /* bless their pointed heads */
+ok:
+       MOVQ    0(RARG), SP     /* restore sp */
+       MOVQ    8(RARG), BX     /* put return pc on the stack */
+       MOVQ    BX, 0(SP)
+       RET
+
+TEXT   setjmp(SB), $0
+       MOVQ    SP, 0(RARG)     /* store sp */
+       MOVQ    0(SP), BX       /* store return pc */
+       MOVQ    BX, 8(RARG)
+       MOVL    $0, AX          /* return 0 */
+       RET
+
+TEXT   sigsetjmp(SB), $0
+       MOVL    savemask+8(FP), BX
+       MOVL    BX, 0(RARG)
+       MOVL    $_psigblocked(SB), 4(RARG)
+       MOVQ    SP, 8(RARG)     /* store sp */
+       MOVQ    0(SP), BX       /* store return pc */
+       MOVQ    BX, 16(RARG)
+       MOVL    $0, AX  /* return 0 */
+       RET
diff --git a/sys/src/ape/lib/ap/amd64/strchr.s b/sys/src/ape/lib/ap/amd64/strchr.s
new file mode 100644 (file)
index 0000000..3175373
--- /dev/null
@@ -0,0 +1,38 @@
+       TEXT    strchr(SB), $0
+
+       MOVQ    RARG, DI
+       MOVB    c+8(FP), AX
+       CMPB    AX, $0
+       JEQ     l2      /**/
+
+/*
+ * char is not null
+ */
+l1:
+       MOVB    (DI), BX
+       CMPB    BX, $0
+       JEQ     ret0
+       ADDQ    $1, DI
+       CMPB    AX, BX
+       JNE     l1
+
+       MOVQ    DI, AX
+       SUBQ    $1, AX
+       RET
+
+/*
+ * char is null
+ */
+l2:
+       MOVQ    $-1, CX
+       CLD
+
+       REPN;   SCASB
+
+       MOVQ    DI, AX
+       SUBQ    $1, AX
+       RET
+
+ret0:
+       MOVQ    $0, AX
+       RET
diff --git a/sys/src/ape/lib/ap/amd64/strlen.s b/sys/src/ape/lib/ap/amd64/strlen.s
new file mode 100644 (file)
index 0000000..cf27cd5
--- /dev/null
@@ -0,0 +1,16 @@
+       TEXT    strlen(SB),$0
+
+       MOVL    $0, AX
+       MOVQ    $-1, CX
+       CLD
+/*
+ * look for end of string
+ */
+
+       MOVQ    RARG, DI
+       REPN;   SCASB
+
+       MOVQ    DI, AX
+       SUBQ    RARG, AX
+       SUBQ    $1, AX
+       RET
diff --git a/sys/src/ape/lib/ap/amd64/tas.s b/sys/src/ape/lib/ap/amd64/tas.s
new file mode 100644 (file)
index 0000000..7aa994b
--- /dev/null
@@ -0,0 +1,5 @@
+TEXT   tas(SB),$0
+
+       MOVL    $0xdeadead,AX
+       XCHGL   AX,(RARG)
+       RET
diff --git a/sys/src/libc/amd64/_seek.c b/sys/src/libc/amd64/_seek.c
new file mode 100644 (file)
index 0000000..d44420d
--- /dev/null
@@ -0,0 +1,14 @@
+#include <u.h>
+#include <libc.h>
+
+extern int _seek(vlong*, int, vlong, int);
+
+vlong
+seek(int fd, vlong o, int p)
+{
+       vlong l;
+
+       if(_seek(&l, fd, o, p) < 0)
+               l = -1LL;
+       return l;
+}
diff --git a/sys/src/libc/amd64/argv0.s b/sys/src/libc/amd64/argv0.s
new file mode 100644 (file)
index 0000000..fe4eb4c
--- /dev/null
@@ -0,0 +1,4 @@
+GLOBL  argv0(SB), $8
+GLOBL  _tos(SB), $8
+GLOBL  _privates(SB), $8
+GLOBL  _nprivates(SB), $4
diff --git a/sys/src/libc/amd64/atom.s b/sys/src/libc/amd64/atom.s
new file mode 100644 (file)
index 0000000..c661887
--- /dev/null
@@ -0,0 +1,69 @@
+TEXT ainc(SB), 1, $0   /* int ainc(int *); */
+ainclp:
+       MOVL    (RARG), AX      /* exp */
+       MOVL    AX, BX
+       INCL    BX              /* new */
+       LOCK; CMPXCHGL BX, (RARG)
+       JNZ     ainclp
+       MOVL    BX, AX
+       RET
+
+TEXT adec(SB), 1, $0   /* int adec(int*); */
+adeclp:
+       MOVL    (RARG), AX
+       MOVL    AX, BX
+       DECL    BX
+       LOCK; CMPXCHGL BX, (RARG)
+       JNZ     adeclp
+       MOVL    BX, AX
+       RET
+
+/*
+ * int cas32(u32int *p, u32int ov, u32int nv);
+ * int cas(uint *p, int ov, int nv);
+ * int casl(ulong *p, ulong ov, ulong nv);
+ */
+
+TEXT cas32(SB), 1, $0
+TEXT cas(SB), 1, $0
+TEXT casul(SB), 1, $0
+TEXT casl(SB), 1, $0                   /* back compat */
+       MOVL    exp+8(FP), AX
+       MOVL    new+16(FP), BX
+       LOCK; CMPXCHGL BX, (RARG)
+       MOVL    $1, AX                          /* use CMOVLEQ etc. here? */
+       JNZ     _cas32r0
+_cas32r1:
+       RET
+_cas32r0:
+       DECL    AX
+       RET
+
+/*
+ * int cas64(u64int *p, u64int ov, u64int nv);
+ * int casp(void **p, void *ov, void *nv);
+ */
+
+TEXT cas64(SB), 1, $0
+TEXT casp(SB), 1, $0
+       MOVQ    exp+8(FP), AX
+       MOVQ    new+16(FP), BX
+       LOCK; CMPXCHGQ BX, (RARG)
+       MOVL    $1, AX                          /* use CMOVLEQ etc. here? */
+       JNZ     _cas64r0
+_cas64r1:
+       RET
+_cas64r0:
+       DECL    AX
+       RET
+
+TEXT fas64(SB), 1, $-4
+TEXT fasp(SB), 1, $-4
+       MOVQ    p+8(FP), AX
+       LOCK; XCHGQ     AX, (RARG)                      /*  */
+       RET
+
+TEXT fas32(SB), 1, $-4
+       MOVL    p+8(FP), AX
+       LOCK; XCHGL     AX, (RARG)                      /*  */
+       RET
diff --git a/sys/src/libc/amd64/cycles.s b/sys/src/libc/amd64/cycles.s
new file mode 100644 (file)
index 0000000..3c00956
--- /dev/null
@@ -0,0 +1,5 @@
+TEXT cycles(SB),1,$0                           /* time stamp counter; cycles since power up */
+       RDTSC
+       MOVL    AX, 0(RARG)                     /* lo */
+       MOVL    DX, 4(RARG)                     /* hi */
+       RET
diff --git a/sys/src/libc/amd64/getcallerpc.s b/sys/src/libc/amd64/getcallerpc.s
new file mode 100644 (file)
index 0000000..15e55ea
--- /dev/null
@@ -0,0 +1,3 @@
+TEXT getcallerpc(SB), 1, $0
+       MOVQ    -8(RARG), AX
+       RET
diff --git a/sys/src/libc/amd64/getfcr.s b/sys/src/libc/amd64/getfcr.s
new file mode 100644 (file)
index 0000000..9fc2002
--- /dev/null
@@ -0,0 +1,38 @@
+
+TEXT   setfcr(SB), $4
+       XORL    $(0x3F<<7),RARG /* bits are cleared in csr to enable them */
+       ANDL    $0xFFC0, RARG   /* just the fcr bits */
+       WAIT    /* is this needed? */
+       STMXCSR 0(SP)
+       MOVL    0(SP), AX
+       ANDL    $~0x3F, AX
+       ORL     RARG, AX
+       MOVL    AX, 0(SP)
+       LDMXCSR 0(SP)
+       RET
+
+TEXT   getfcr(SB), $4
+       WAIT
+       STMXCSR 0(SP)
+       MOVWLZX 0(SP), AX
+       ANDL    $0xFFC0, AX
+       XORL    $(0x3F<<7),AX
+       RET
+
+TEXT   getfsr(SB), $4
+       WAIT
+       STMXCSR 0(SP)
+       MOVL    0(SP), AX
+       ANDL    $0x3F, AX
+       RET
+
+TEXT   setfsr(SB), $4
+       ANDL    $0x3F, RARG
+       WAIT
+       STMXCSR 0(SP)
+       MOVL    0(SP), AX
+       ANDL    $~0x3F, AX
+       ORL     RARG, AX
+       MOVL    AX, 0(SP)
+       LDMXCSR 0(SP)
+       RET
diff --git a/sys/src/libc/amd64/main9.s b/sys/src/libc/amd64/main9.s
new file mode 100644 (file)
index 0000000..c9d5853
--- /dev/null
@@ -0,0 +1,19 @@
+#define NPRIVATES      16
+
+TEXT   _main(SB), 1, $(2*8+NPRIVATES*8)
+       MOVQ    AX, _tos(SB)
+       LEAQ    16(SP), AX
+       MOVQ    AX, _privates(SB)
+       MOVL    $NPRIVATES, _nprivates(SB)
+       MOVL    inargc-8(FP), RARG
+       LEAQ    inargv+0(FP), AX
+       MOVQ    AX, 8(SP)
+       CALL    main(SB)
+
+loop:
+       MOVQ    $_exits<>(SB), RARG
+       CALL    exits(SB)
+       JMP     loop
+
+DATA   _exits<>+0(SB)/4, $"main"
+GLOBL  _exits<>+0(SB), $5
diff --git a/sys/src/libc/amd64/main9p.s b/sys/src/libc/amd64/main9p.s
new file mode 100644 (file)
index 0000000..353b180
--- /dev/null
@@ -0,0 +1,41 @@
+#define NPRIVATES      16
+
+TEXT _mainp(SB), 1, $(2*8+NPRIVATES*8)
+       MOVQ    AX, _tos(SB)            /* _tos = arg */
+       LEAQ    16(SP), AX
+       MOVQ    AX, _privates(SB)
+       MOVL    $NPRIVATES, _nprivates(SB)
+
+       CALL    _profmain(SB)           /* _profmain(); */
+
+       MOVQ    _tos+0(SB), DX          /* _tos->prof.pp = _tos->prof.next; */
+       MOVQ    8(DX), CX
+       MOVQ    CX, (DX)
+
+       MOVL    inargc-8(FP), RARG      /* main(argc, argv); */
+       LEAQ    inargv+0(FP), AX
+       MOVQ    AX, 8(SP)
+       CALL    main(SB)
+
+loop:
+       MOVQ    $_exits<>(SB), RARG
+       CALL    exits(SB)
+       MOVQ    $_profin(SB), AX        /* force loading of profile */
+       JMP     loop
+
+TEXT   _savearg(SB), 1, $0
+       MOVQ    RARG, AX
+       RET
+
+TEXT   _saveret(SB), 1, $0
+       RET
+
+TEXT   _restorearg(SB), 1, $0
+       RET                             /* we want RARG in RARG */
+
+TEXT   _callpc(SB), 1, $0
+       MOVQ    8(RARG), AX
+       RET
+
+DATA   _exits<>+0(SB)/4, $"main"
+GLOBL  _exits<>+0(SB), $5
diff --git a/sys/src/libc/amd64/memccpy.s b/sys/src/libc/amd64/memccpy.s
new file mode 100644 (file)
index 0000000..d878faa
--- /dev/null
@@ -0,0 +1,58 @@
+       TEXT    memccpy(SB),$0
+
+       MOVL    n+24(FP), CX
+       CMPL    CX, $0
+       JEQ     none
+       MOVQ    p2+8(FP), DI
+       MOVBLZX c+16(FP), AX
+       CLD
+/*
+ * find the character in the second string
+ */
+
+       REPN;   SCASB
+       JEQ     found
+
+/*
+ * if not found, set count to 'n'
+ */
+none:
+       MOVL    $0, AX
+       MOVL    n+24(FP), BX
+       JMP     memcpy
+
+/*
+ * if found, set count to bytes thru character
+ */
+found:
+       MOVQ    DI, AX
+       SUBQ    p2+8(FP), AX
+       MOVQ    AX, BX
+       ADDQ    RARG, AX
+
+/*
+ * copy the memory
+ */
+
+memcpy:
+       MOVQ    RARG, DI
+       MOVQ    p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+       MOVQ    DI, DX
+       ORQ     SI, DX
+       ANDL    $3, DX
+       JNE     c3
+       MOVL    BX, CX
+       SHRQ    $2, CX
+       REP;    MOVSL
+/*
+ * copy the rest, by bytes
+ */
+       ANDL    $3, BX
+c3:
+       MOVL    BX, CX
+       REP;    MOVSB
+
+       RET
diff --git a/sys/src/libc/amd64/memchr.s b/sys/src/libc/amd64/memchr.s
new file mode 100644 (file)
index 0000000..3648e84
--- /dev/null
@@ -0,0 +1,23 @@
+       TEXT    memchr(SB),$0
+
+       MOVL    n+16(FP), CX
+       CMPL    CX, $0
+       JEQ     none
+       MOVQ    RARG, DI
+       MOVBLZX c+8(FP), AX
+       CLD
+/*
+ * SCASB is memchr instruction
+ */
+
+       REPN;   SCASB
+       JEQ     found
+
+none:
+       MOVL    $0, AX
+       RET
+
+found:
+       MOVQ    DI, AX
+       SUBQ    $1, AX
+       RET
diff --git a/sys/src/libc/amd64/memcmp.s b/sys/src/libc/amd64/memcmp.s
new file mode 100644 (file)
index 0000000..71098b2
--- /dev/null
@@ -0,0 +1,52 @@
+       TEXT    memcmp(SB),$0
+
+       MOVL    n+16(FP), BX
+       CMPL    BX, $0
+       JEQ     none
+       MOVQ    RARG, DI
+       MOVQ    p2+8(FP), SI
+       CLD
+       MOVQ    DI, CX
+       ORQ     SI, CX
+       ANDL    $3, CX
+       JNE     c3
+/*
+ * first by longs
+ */
+
+       MOVL    BX, CX
+       SHRQ    $2, CX
+
+       REP;    CMPSL
+       JNE     found
+
+/*
+ * then by bytes
+ */
+       ANDL    $3, BX
+c3:
+       MOVL    BX, CX
+       REP;    CMPSB
+       JNE     found1
+
+none:
+       MOVQ    $0, AX
+       RET
+
+/*
+ * if long found,
+ * back up and look by bytes
+ */
+found:
+       MOVL    $4, CX
+       SUBQ    CX, DI
+       SUBQ    CX, SI
+       REP;    CMPSB
+
+found1:
+       JLS     lt
+       MOVQ    $-1, AX
+       RET
+lt:
+       MOVQ    $1, AX
+       RET
diff --git a/sys/src/libc/amd64/memcpy.s b/sys/src/libc/amd64/memcpy.s
new file mode 100644 (file)
index 0000000..878e802
--- /dev/null
@@ -0,0 +1,81 @@
+TEXT memcpy(SB), $0
+       MOVQ    RARG, DI
+       MOVQ    DI, AX                  /* return value */
+       MOVQ    p2+8(FP), SI
+       MOVL    n+16(FP), BX
+       CMPL    BX, $0
+       JGT     _ok
+       JEQ     _return                 /* nothing to do if n == 0 */
+       MOVL    $0, SI                  /* fault if n < 0 */
+
+/*
+ * check and set for backwards:
+ *     (p2 < p1) && ((p2+n) > p1)
+ */
+_ok:
+       CMPQ    SI, DI
+       JGT     _forward
+       JEQ     _return                 /* nothing to do if p2 == p1 */
+       MOVQ    SI, DX
+       ADDQ    BX, DX
+       CMPQ    DX, DI
+       JGT     _back
+
+/*
+ * copy whole longs if aligned
+ */
+_forward:
+       CLD
+       MOVQ    SI, DX
+       ORQ     DI, DX
+       ANDL    $3, DX
+       JNE     c3f
+       MOVQ    BX, CX
+       SHRQ    $2, CX
+       ANDL    $3, BX
+       REP;    MOVSL
+
+/*
+ * copy the rest, by bytes
+ */
+       JEQ     _return                 /* flags set by above ANDL */
+c3f:
+       MOVL    BX, CX
+       REP;    MOVSB
+
+       RET
+
+/*
+ * whole thing backwards has
+ * adjusted addresses
+ */
+_back:
+       ADDQ    BX, DI
+       ADDQ    BX, SI
+       STD
+       SUBQ    $4, DI
+       SUBQ    $4, SI
+/*
+ * copy whole longs, if aligned
+ */
+       MOVQ    DI, DX
+       ORQ     SI, DX
+       ANDL    $3, DX
+       JNE     c3b
+       MOVL    BX, CX
+       SHRQ    $2, CX
+       ANDL    $3, BX
+       REP;    MOVSL
+/*
+ * copy the rest, by bytes
+ */
+       JEQ     _return                 /* flags set by above ANDL */
+
+c3b:
+       ADDQ    $3, DI
+       ADDQ    $3, SI
+       MOVL    BX, CX
+       REP;    MOVSB
+
+_return:
+       RET
diff --git a/sys/src/libc/amd64/memmove.s b/sys/src/libc/amd64/memmove.s
new file mode 100644 (file)
index 0000000..1f00537
--- /dev/null
@@ -0,0 +1,81 @@
+TEXT memmove(SB), $0
+       MOVQ    RARG, DI
+       MOVQ    DI, AX                  /* return value */
+       MOVQ    p2+8(FP), SI
+       MOVL    n+16(FP), BX
+       CMPL    BX, $0
+       JGT     _ok
+       JEQ     _return                 /* nothing to do if n == 0 */
+       MOVL    $0, SI                  /* fault if n < 0 */
+
+/*
+ * check and set for backwards:
+ *     (p2 < p1) && ((p2+n) > p1)
+ */
+_ok:
+       CMPQ    SI, DI
+       JGT     _forward
+       JEQ     _return                 /* nothing to do if p2 == p1 */
+       MOVQ    SI, DX
+       ADDQ    BX, DX
+       CMPQ    DX, DI
+       JGT     _back
+
+/*
+ * copy whole longs if aligned
+ */
+_forward:
+       CLD
+       MOVQ    SI, DX
+       ORQ     DI, DX
+       ANDL    $3, DX
+       JNE     c3f
+       MOVQ    BX, CX
+       SHRQ    $2, CX
+       ANDL    $3, BX
+       REP;    MOVSL
+
+/*
+ * copy the rest, by bytes
+ */
+       JEQ     _return                 /* flags set by above ANDL */
+c3f:
+       MOVL    BX, CX
+       REP;    MOVSB
+
+       RET
+
+/*
+ * whole thing backwards has
+ * adjusted addresses
+ */
+_back:
+       ADDQ    BX, DI
+       ADDQ    BX, SI
+       STD
+       SUBQ    $4, DI
+       SUBQ    $4, SI
+/*
+ * copy whole longs, if aligned
+ */
+       MOVQ    DI, DX
+       ORQ     SI, DX
+       ANDL    $3, DX
+       JNE     c3b
+       MOVL    BX, CX
+       SHRQ    $2, CX
+       ANDL    $3, BX
+       REP;    MOVSL
+/*
+ * copy the rest, by bytes
+ */
+       JEQ     _return                 /* flags set by above ANDL */
+
+c3b:
+       ADDQ    $3, DI
+       ADDQ    $3, SI
+       MOVL    BX, CX
+       REP;    MOVSB
+
+_return:
+       RET
diff --git a/sys/src/libc/amd64/memset.s b/sys/src/libc/amd64/memset.s
new file mode 100644 (file)
index 0000000..d190ede
--- /dev/null
@@ -0,0 +1,41 @@
+       TEXT    memset(SB),$0
+
+       CLD
+       MOVQ    RARG, DI
+       MOVBLZX c+8(FP), AX
+       MOVL    n+16(FP), BX
+/*
+ * if not enough bytes, just set bytes
+ */
+       CMPL    BX, $9
+       JLS     c3
+/*
+ * if not aligned, just set bytes
+ */
+       MOVQ    RARG, CX
+       ANDL    $3,CX
+       JNE     c3
+/*
+ * build word in AX
+ */
+       MOVB    AL, AH
+       MOVL    AX, CX
+       SHLL    $16, CX
+       ORL     CX, AX
+/*
+ * set whole longs
+ */
+c1:
+       MOVQ    BX, CX
+       SHRQ    $2, CX
+       ANDL    $3, BX
+       REP;    STOSL
+/*
+ * set the rest, by bytes
+ */
+c3:
+       MOVL    BX, CX
+       REP;    STOSB
+ret:
+       MOVQ    RARG,AX
+       RET
diff --git a/sys/src/libc/amd64/mkfile b/sys/src/libc/amd64/mkfile
new file mode 100644 (file)
index 0000000..81aa744
--- /dev/null
@@ -0,0 +1,41 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libc.a
+SFILES=\
+       argv0.s\
+       atom.s\
+       cycles.s\
+       getfcr.s\
+       main9.s\
+       main9p.s\
+       memccpy.s\
+       memchr.s\
+       memcmp.s\
+       memcpy.s\
+       memmove.s\
+       memset.s\
+       muldiv.s\
+       setjmp.s\
+       sqrt.s\
+       strcat.s\
+       strchr.s\
+       strcpy.s\
+       strlen.s\
+       tas.s\
+
+CFILES=\
+       _seek.c\
+       getcallerpc.c\
+       notejmp.c\
+
+HFILES=/sys/include/libc.h
+
+OFILES=${CFILES:%.c=%.$O} ${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+       $HFILES\
+       $CFILES\
+       $SFILES\
+
+</sys/src/cmd/mksyslib
diff --git a/sys/src/libc/amd64/muldiv.s b/sys/src/libc/amd64/muldiv.s
new file mode 100644 (file)
index 0000000..be67302
--- /dev/null
@@ -0,0 +1,12 @@
+TEXT   umuldiv(SB), $0
+       MOVL    RARG, AX
+       MULL    b+8(FP)
+       DIVL    c+16(FP)
+       RET
+
+TEXT   muldiv(SB), $0
+       MOVL    RARG, AX
+       IMULL   b+8(FP)
+       IDIVL   c+16(FP)
+       RET
+       END
diff --git a/sys/src/libc/amd64/notejmp.c b/sys/src/libc/amd64/notejmp.c
new file mode 100644 (file)
index 0000000..100c41e
--- /dev/null
@@ -0,0 +1,16 @@
+#include <u.h>
+#include <libc.h>
+#include <ureg.h>
+
+void
+notejmp(void *vr, jmp_buf j, int ret)
+{
+       struct Ureg *r = vr;
+
+       r->ax = ret;
+       if(ret == 0)
+               r->ax = 1;
+       r->pc = j[JMPBUFPC];
+       r->sp = j[JMPBUFSP] + 8;
+       noted(NCONT);
+}
diff --git a/sys/src/libc/amd64/setjmp.s b/sys/src/libc/amd64/setjmp.s
new file mode 100644 (file)
index 0000000..520e05c
--- /dev/null
@@ -0,0 +1,17 @@
+TEXT   longjmp(SB), $0
+       MOVL    r+8(FP), AX
+       CMPL    AX, $0
+       JNE     ok              /* ansi: "longjmp(0) => longjmp(1)" */
+       MOVL    $1, AX          /* bless their pointed heads */
+ok:
+       MOVQ    0(RARG), SP     /* restore sp */
+       MOVQ    8(RARG), BX     /* put return pc on the stack */
+       MOVQ    BX, 0(SP)
+       RET
+
+TEXT   setjmp(SB), $0
+       MOVQ    SP, 0(RARG)     /* store sp */
+       MOVQ    0(SP), BX       /* store return pc */
+       MOVQ    BX, 8(RARG)
+       MOVL    $0, AX          /* return 0 */
+       RET
diff --git a/sys/src/libc/amd64/sqrt.s b/sys/src/libc/amd64/sqrt.s
new file mode 100644 (file)
index 0000000..6c8338b
--- /dev/null
@@ -0,0 +1,4 @@
+TEXT   sqrt(SB), $0
+       MOVSD   a+0(FP), X0
+       SQRTSD  X0, X0
+       RET
diff --git a/sys/src/libc/amd64/strcat.s b/sys/src/libc/amd64/strcat.s
new file mode 100644 (file)
index 0000000..ff0339d
--- /dev/null
@@ -0,0 +1,48 @@
+       TEXT    strcat(SB),$0
+
+       MOVL    $0, AX
+       MOVQ    $-1, CX
+       CLD
+
+/*
+ * find length of second string
+ */
+
+       MOVQ    p2+8(FP), DI
+       REPN;   SCASB
+
+       MOVQ    DI, BX
+       SUBQ    p2+8(FP), BX
+
+/*
+ * find end of first string
+ */
+
+       MOVQ    RARG, DI
+       REPN;   SCASB
+
+/*
+ * copy the memory
+ */
+       SUBQ    $1, DI
+       MOVQ    p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+       MOVQ    DI, CX
+       ORQ     SI, CX
+       ANDL    $3, CX
+       JNE     c3
+       MOVQ    BX, CX
+       SHRQ    $2, CX
+       REP;    MOVSL
+/*
+ * copy the rest, by bytes
+ */
+       ANDL    $3, BX
+c3:
+       MOVQ    BX, CX
+       REP;    MOVSB
+
+       MOVQ    RARG, AX
+       RET
diff --git a/sys/src/libc/amd64/strchr.s b/sys/src/libc/amd64/strchr.s
new file mode 100644 (file)
index 0000000..3175373
--- /dev/null
@@ -0,0 +1,38 @@
+       TEXT    strchr(SB), $0
+
+       MOVQ    RARG, DI
+       MOVB    c+8(FP), AX
+       CMPB    AX, $0
+       JEQ     l2      /**/
+
+/*
+ * char is not null
+ */
+l1:
+       MOVB    (DI), BX
+       CMPB    BX, $0
+       JEQ     ret0
+       ADDQ    $1, DI
+       CMPB    AX, BX
+       JNE     l1
+
+       MOVQ    DI, AX
+       SUBQ    $1, AX
+       RET
+
+/*
+ * char is null
+ */
+l2:
+       MOVQ    $-1, CX
+       CLD
+
+       REPN;   SCASB
+
+       MOVQ    DI, AX
+       SUBQ    $1, AX
+       RET
+
+ret0:
+       MOVQ    $0, AX
+       RET
diff --git a/sys/src/libc/amd64/strcpy.s b/sys/src/libc/amd64/strcpy.s
new file mode 100644 (file)
index 0000000..850ceb8
--- /dev/null
@@ -0,0 +1,40 @@
+       TEXT    strcpy(SB),$0
+
+       MOVL    $0, AX
+       MOVQ    $-1, CX
+       CLD
+/*
+ * find end of second string
+ */
+
+       MOVQ    p2+8(FP), DI
+       REPN;   SCASB
+
+       MOVQ    DI, BX
+       SUBQ    p2+8(FP), BX
+
+/*
+ * copy the memory
+ */
+       MOVQ    RARG, DI
+       MOVQ    p2+8(FP), SI
+/*
+ * copy whole longs, if aligned
+ */
+       MOVQ    DI, CX
+       ORQ             SI, CX
+       ANDL    $3, CX
+       JNE     c3
+       MOVQ    BX, CX
+       SHRQ    $2, CX
+       REP;    MOVSL
+/*
+ * copy the rest, by bytes
+ */
+       ANDL    $3, BX
+c3:
+       MOVL    BX, CX
+       REP;    MOVSB
+
+       MOVQ    RARG, AX
+       RET
diff --git a/sys/src/libc/amd64/strlen.s b/sys/src/libc/amd64/strlen.s
new file mode 100644 (file)
index 0000000..cf27cd5
--- /dev/null
@@ -0,0 +1,16 @@
+       TEXT    strlen(SB),$0
+
+       MOVL    $0, AX
+       MOVQ    $-1, CX
+       CLD
+/*
+ * look for end of string
+ */
+
+       MOVQ    RARG, DI
+       REPN;   SCASB
+
+       MOVQ    DI, AX
+       SUBQ    RARG, AX
+       SUBQ    $1, AX
+       RET
diff --git a/sys/src/libc/amd64/tas.s b/sys/src/libc/amd64/tas.s
new file mode 100644 (file)
index 0000000..571828d
--- /dev/null
@@ -0,0 +1,5 @@
+TEXT   _tas(SB), 1, $0
+
+       MOVL    $0xdeaddead,AX
+       XCHGL   AX,(RARG)
+       RET
diff --git a/sys/src/libmp/amd64/mkfile b/sys/src/libmp/amd64/mkfile
new file mode 100644 (file)
index 0000000..c9ecdb0
--- /dev/null
@@ -0,0 +1,20 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libmp.a
+SFILES=\
+       mpdigdiv.s\
+       mpvecadd.s\
+       mpvecdigmuladd.s\
+       mpvecdigmulsub.s\
+       mpvecsub.s\
+
+HFILES=/$objtype/include/u.h /sys/include/mp.h ../port/dat.h
+
+OFILES=${SFILES:%.s=%.$O}
+
+UPDATE=mkfile\
+       $HFILES\
+       $SFILES\
+
+</sys/src/cmd/mksyslib
diff --git a/sys/src/libmp/amd64/mpdigdiv.s b/sys/src/libmp/amd64/mpdigdiv.s
new file mode 100644 (file)
index 0000000..6025d14
--- /dev/null
@@ -0,0 +1,21 @@
+TEXT   mpdigdiv(SB),$0
+
+/*     MOVL    dividend+0(FP),BX */
+       MOVL    0(RARG),AX
+       MOVL    4(RARG),DX
+       MOVL    divisor+8(FP),BX
+       MOVQ    quotient+16(FP),DI
+       XORL    CX,CX
+       CMPL    DX,BX           /* dividend >= 2^32 * divisor */
+       JHS     _divovfl
+       CMPL    BX,CX           /* divisor == 0 */
+       JE      _divovfl
+       DIVL    BX              /* AX = DX:AX/BX */
+       MOVL    AX,0(DI)
+       RET
+
+       /* return all 1's */
+_divovfl:
+       NOTL    CX
+       MOVL    CX,0(DI)
+       RET
diff --git a/sys/src/libmp/amd64/mpvecadd.s b/sys/src/libmp/amd64/mpvecadd.s
new file mode 100644 (file)
index 0000000..326f39d
--- /dev/null
@@ -0,0 +1,54 @@
+/*
+ *     mpvecadd(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *sum)
+ *
+ *             sum[0:alen] = a[0:alen-1] + b[0:blen-1]
+ *
+ *     prereq: alen >= blen, sum has room for alen+1 digits
+ */
+TEXT   mpvecadd(SB),$0
+
+       MOVL    alen+8(FP),DX
+       MOVL    blen+24(FP),CX
+/*     MOVL    a+0(FP),SI */
+       MOVQ    RARG, SI
+       MOVQ    b+16(FP),BX
+       SUBL    CX,DX
+       MOVQ    sum+32(FP),DI
+       XORL    BP,BP                   /* this also sets carry to 0 */
+
+       /* skip addition if b is zero */
+       TESTL   CX,CX
+       JZ      _add1
+
+       /* sum[0:blen-1],carry = a[0:blen-1] + b[0:blen-1] */
+_addloop1:
+       MOVL    (SI)(BP*4), AX
+       ADCL    (BX)(BP*4), AX
+       MOVL    AX,(DI)(BP*4)
+       INCL    BP
+       LOOP    _addloop1
+
+_add1:
+       /* jump if alen > blen */
+       INCL    DX
+       MOVL    DX,CX
+       LOOP    _addloop2
+
+       /* sum[alen] = carry */
+_addend:
+       JC      _addcarry
+       MOVL    $0,(DI)(BP*4)
+       RET
+_addcarry:
+       MOVL    $1,(DI)(BP*4)
+       RET
+
+       /* sum[blen:alen-1],carry = a[blen:alen-1] + 0 */
+_addloop2:
+       MOVL    (SI)(BP*4),AX
+       ADCL    $0,AX
+       MOVL    AX,(DI)(BP*4)
+       INCL    BP
+       LOOP    _addloop2
+       JMP     _addend
+
diff --git a/sys/src/libmp/amd64/mpvecdigmuladd.s b/sys/src/libmp/amd64/mpvecdigmuladd.s
new file mode 100644 (file)
index 0000000..6599a42
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ *     mpvecdigmul(mpdigit *b, int n, mpdigit m, mpdigit *p)
+ *
+ *     p += b*m
+ *
+ *     each step look like:
+ *             hi,lo = m*b[i]
+ *             lo += oldhi + carry
+ *             hi += carry
+ *             p[i] += lo
+ *             oldhi = hi
+ *
+ *     the registers are:
+ *             hi = DX         - constrained by hardware
+ *             lo = AX         - constrained by hardware
+ *             b+n = SI        - can't be BP
+ *             p+n = DI        - can't be BP
+ *             i-n = BP
+ *             m = BX
+ *             oldhi = CX
+ *             
+ */
+TEXT   mpvecdigmuladd(SB),$0
+
+/*     MOVQ    b+0(FP),SI      */
+       MOVQ    RARG,SI
+       MOVL    n+8(FP),CX
+       MOVL    m+16(FP),BX
+       MOVQ    p+24(FP),DI
+       MOVL    CX,BP
+       NEGQ    BP              /* BP = -n */
+       SHLL    $2,CX
+       ADDQ    CX,SI           /* SI = b + n */
+       ADDQ    CX,DI           /* DI = p + n */
+       XORL    CX,CX
+_muladdloop:
+       MOVL    (SI)(BP*4),AX   /* lo = b[i] */
+       MULL    BX              /* hi, lo = b[i] * m */
+       ADDL    CX,AX           /* lo += oldhi */
+       JCC     _muladdnocarry1
+       INCL    DX              /* hi += carry */
+_muladdnocarry1:
+       ADDL    AX,(DI)(BP*4)   /* p[i] += lo */
+       JCC     _muladdnocarry2
+       INCL    DX              /* hi += carry */
+_muladdnocarry2:
+       MOVL    DX,CX           /* oldhi = hi */
+       INCQ    BP              /* i++ */
+       JNZ     _muladdloop
+       XORL    AX,AX
+       ADDL    CX,(DI)(BP*4)   /* p[n] + oldhi */
+       ADCL    AX,AX           /* return carry out of p[n] */
+       RET
diff --git a/sys/src/libmp/amd64/mpvecdigmulsub.s b/sys/src/libmp/amd64/mpvecdigmulsub.s
new file mode 100644 (file)
index 0000000..0b5a357
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ *     mpvecdigmulsub(mpdigit *b, int n, mpdigit m, mpdigit *p)
+ *
+ *     p -= b*m
+ *
+ *     each step look like:
+ *             hi,lo = m*b[i]
+ *             lo += oldhi + carry
+ *             hi += carry
+ *             p[i] += lo
+ *             oldhi = hi
+ *
+ *     the registers are:
+ *             hi = DX         - constrained by hardware
+ *             lo = AX         - constrained by hardware
+ *             b = SI          - can't be BP
+ *             p = DI          - can't be BP
+ *             i = BP
+ *             n = CX          - constrained by LOOP instr
+ *             m = BX
+ *             oldhi = R8
+ *             
+ */
+TEXT   mpvecdigmulsub(SB),$0
+
+/*     MOVL    b+0(FP),SI      */
+       MOVQ    RARG,SI
+       MOVL    n+8(FP),CX
+       MOVL    m+16(FP),BX
+       MOVQ    p+24(FP),DI
+       XORL    BP,BP
+       MOVL    BP,R8
+_mulsubloop:
+       MOVL    (SI)(BP*4),AX           /* lo = b[i] */
+       MULL    BX                      /* hi, lo = b[i] * m */
+       ADDL    R8,AX           /* lo += oldhi */
+       JCC     _mulsubnocarry1
+       INCL    DX                      /* hi += carry */
+_mulsubnocarry1:
+       SUBL    AX,(DI)(BP*4)
+       JCC     _mulsubnocarry2
+       INCL    DX                      /* hi += carry */
+_mulsubnocarry2:
+       MOVL    DX,R8
+       INCL    BP
+       LOOP    _mulsubloop
+       SUBL    R8,(DI)(BP*4)
+       JCC     _mulsubnocarry3
+       MOVQ    $-1,AX
+       RET
+_mulsubnocarry3:
+       MOVQ    $1,AX
+       RET
diff --git a/sys/src/libmp/amd64/mpvecsub.s b/sys/src/libmp/amd64/mpvecsub.s
new file mode 100644 (file)
index 0000000..9e1b534
--- /dev/null
@@ -0,0 +1,45 @@
+/*
+ *     mpvecsub(mpdigit *a, int alen, mpdigit *b, int blen, mpdigit *diff)
+ *
+ *             diff[0:alen-1] = a[0:alen-1] - b[0:blen-1]
+ *
+ *     prereq: alen >= blen, diff has room for alen digits
+ */
+TEXT   mpvecsub(SB),$0
+
+/*     MOVQ    a+0(FP),SI */
+       MOVQ    RARG, SI
+       MOVQ    b+16(FP),BX
+       MOVL    alen+8(FP),DX
+       MOVL    blen+24(FP),CX
+       MOVQ    diff+32(FP),DI
+       SUBL    CX,DX
+       XORL    BP,BP                   /* this also sets carry to 0 */
+
+       /* skip subraction if b is zero */
+       TESTL   CX,CX
+       JZ      _sub1
+
+       /* diff[0:blen-1],borrow = a[0:blen-1] - b[0:blen-1] */
+_subloop1:
+       MOVL    (SI)(BP*4),AX
+       SBBL    (BX)(BP*4),AX
+       MOVL    AX,(DI)(BP*4)
+       INCL    BP
+       LOOP    _subloop1
+
+_sub1:
+       INCL    DX
+       MOVL    DX,CX
+       LOOP    _subloop2
+       RET
+
+       /* diff[blen:alen-1] = a[blen:alen-1] - 0 */
+_subloop2:
+       MOVL    (SI)(BP*4),AX
+       SBBL    $0,AX
+       MOVL    AX,(DI)(BP*4)
+       INCL    BP
+       LOOP    _subloop2
+       RET
+
diff --git a/sys/src/libsec/amd64/md5block.s b/sys/src/libsec/amd64/md5block.s
new file mode 100644 (file)
index 0000000..4e0d008
--- /dev/null
@@ -0,0 +1,212 @@
+/*
+ *  rfc1321 requires that I include this.  The code is new.  The constants
+ *  all come from the rfc (hence the copyright).  We trade a table for the
+ *  macros in rfc.  The total size is a lot less. -- presotto
+ *
+ *     Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ *     rights reserved.
+ *
+ *     License to copy and use this software is granted provided that it
+ *     is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ *     Algorithm" in all material mentioning or referencing this software
+ *     or this function.
+ *
+ *     License is also granted to make and use derivative works provided
+ *     that such works are identified as "derived from the RSA Data
+ *     Security, Inc. MD5 Message-Digest Algorithm" in all material
+ *     mentioning or referencing the derived work.
+ *
+ *     RSA Data Security, Inc. makes no representations concerning either
+ *     the merchantability of this software or the suitability of this
+ *     software forany particular purpose. It is provided "as is"
+ *     without express or implied warranty of any kind.
+ *     These notices must be retained in any copies of any part of this
+ *     documentation and/or software.
+ */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+/*
+ * SI is data
+ *     a += FN(B,C,D);
+ *     a += x[sh] + t[sh];
+ *     a = (a << S11) | (a >> (32 - S11));
+ *     a += b;
+ */
+
+#define BODY1(off,V,FN,SH,A,B,C,D)\
+       FN(B,C,D)\
+       LEAL V(A)(DI*1),A;\
+       ADDL (off)(BP),A;\
+       ROLL $SH,A;\
+       ADDL B,A;\
+
+#define BODY(off,V,FN,SH,A,B,C,D)\
+       FN(B,C,D)\
+       LEAL V(A)(DI*1),A;\
+       ADDL (off)(BP),A;\
+       ROLL $SH,A;\
+       ADDL B,A;\
+
+/*
+ * fn1 = ((c ^ d) & b) ^ d
+ */
+#define FN1(B,C,D)\
+       MOVL C,DI;\
+       XORL D,DI;\
+       ANDL B,DI;\
+       XORL D,DI;\
+
+/*
+ * fn2 = ((b ^ c) & d) ^ c;
+ */
+#define FN2(B,C,D)\
+       MOVL B,DI;\
+       XORL C,DI;\
+       ANDL D,DI;\
+       XORL C,DI;\
+
+/*
+ * fn3 = b ^ c ^ d;
+ */
+#define FN3(B,C,D)\
+       MOVL B,DI;\
+       XORL C,DI;\
+       XORL D,DI;\
+
+/*
+ * fn4 = c ^ (b | ~d);
+ */
+#define FN4(B,C,D)\
+       MOVL D,DI;\
+       XORL $-1,DI;\
+       ORL B,DI;\
+       XORL C,DI;\
+
+#define        LEN     8
+#define        STATE   16
+
+TEXT   _md5block+0(SB), $0
+
+       MOVQ    RARG,R8
+       MOVLQZX len+LEN(FP),BX
+       ADDQ    BX,R8
+
+mainloop:
+       MOVQ state+STATE(FP),SI
+       MOVL (SI),AX
+       MOVL 4(SI),BX
+       MOVL 8(SI),CX
+       MOVL 12(SI),DX
+
+       BODY1( 0*4,0xd76aa478,FN1,S11,AX,BX,CX,DX)
+       BODY1( 1*4,0xe8c7b756,FN1,S12,DX,AX,BX,CX)
+       BODY1( 2*4,0x242070db,FN1,S13,CX,DX,AX,BX)
+       BODY1( 3*4,0xc1bdceee,FN1,S14,BX,CX,DX,AX)
+
+       BODY1( 4*4,0xf57c0faf,FN1,S11,AX,BX,CX,DX)
+       BODY1( 5*4,0x4787c62a,FN1,S12,DX,AX,BX,CX)
+       BODY1( 6*4,0xa8304613,FN1,S13,CX,DX,AX,BX)
+       BODY1( 7*4,0xfd469501,FN1,S14,BX,CX,DX,AX)
+
+       BODY1( 8*4,0x698098d8,FN1,S11,AX,BX,CX,DX)
+       BODY1( 9*4,0x8b44f7af,FN1,S12,DX,AX,BX,CX)
+       BODY1(10*4,0xffff5bb1,FN1,S13,CX,DX,AX,BX)
+       BODY1(11*4,0x895cd7be,FN1,S14,BX,CX,DX,AX)
+
+       BODY1(12*4,0x6b901122,FN1,S11,AX,BX,CX,DX)
+       BODY1(13*4,0xfd987193,FN1,S12,DX,AX,BX,CX)
+       BODY1(14*4,0xa679438e,FN1,S13,CX,DX,AX,BX)
+       BODY1(15*4,0x49b40821,FN1,S14,BX,CX,DX,AX)
+
+
+       BODY( 1*4,0xf61e2562,FN2,S21,AX,BX,CX,DX)
+       BODY( 6*4,0xc040b340,FN2,S22,DX,AX,BX,CX)
+       BODY(11*4,0x265e5a51,FN2,S23,CX,DX,AX,BX)
+       BODY( 0*4,0xe9b6c7aa,FN2,S24,BX,CX,DX,AX)
+
+       BODY( 5*4,0xd62f105d,FN2,S21,AX,BX,CX,DX)
+       BODY(10*4,0x02441453,FN2,S22,DX,AX,BX,CX)
+       BODY(15*4,0xd8a1e681,FN2,S23,CX,DX,AX,BX)
+       BODY( 4*4,0xe7d3fbc8,FN2,S24,BX,CX,DX,AX)
+
+       BODY( 9*4,0x21e1cde6,FN2,S21,AX,BX,CX,DX)
+       BODY(14*4,0xc33707d6,FN2,S22,DX,AX,BX,CX)
+       BODY( 3*4,0xf4d50d87,FN2,S23,CX,DX,AX,BX)
+       BODY( 8*4,0x455a14ed,FN2,S24,BX,CX,DX,AX)
+
+       BODY(13*4,0xa9e3e905,FN2,S21,AX,BX,CX,DX)
+       BODY( 2*4,0xfcefa3f8,FN2,S22,DX,AX,BX,CX)
+       BODY( 7*4,0x676f02d9,FN2,S23,CX,DX,AX,BX)
+       BODY(12*4,0x8d2a4c8a,FN2,S24,BX,CX,DX,AX)
+
+
+       BODY( 5*4,0xfffa3942,FN3,S31,AX,BX,CX,DX)
+       BODY( 8*4,0x8771f681,FN3,S32,DX,AX,BX,CX)
+       BODY(11*4,0x6d9d6122,FN3,S33,CX,DX,AX,BX)
+       BODY(14*4,0xfde5380c,FN3,S34,BX,CX,DX,AX)
+
+       BODY( 1*4,0xa4beea44,FN3,S31,AX,BX,CX,DX)
+       BODY( 4*4,0x4bdecfa9,FN3,S32,DX,AX,BX,CX)
+       BODY( 7*4,0xf6bb4b60,FN3,S33,CX,DX,AX,BX)
+       BODY(10*4,0xbebfbc70,FN3,S34,BX,CX,DX,AX)
+
+       BODY(13*4,0x289b7ec6,FN3,S31,AX,BX,CX,DX)
+       BODY( 0*4,0xeaa127fa,FN3,S32,DX,AX,BX,CX)
+       BODY( 3*4,0xd4ef3085,FN3,S33,CX,DX,AX,BX)
+       BODY( 6*4,0x04881d05,FN3,S34,BX,CX,DX,AX)
+
+       BODY( 9*4,0xd9d4d039,FN3,S31,AX,BX,CX,DX)
+       BODY(12*4,0xe6db99e5,FN3,S32,DX,AX,BX,CX)
+       BODY(15*4,0x1fa27cf8,FN3,S33,CX,DX,AX,BX)
+       BODY( 2*4,0xc4ac5665,FN3,S34,BX,CX,DX,AX)
+
+
+       BODY( 0*4,0xf4292244,FN4,S41,AX,BX,CX,DX)
+       BODY( 7*4,0x432aff97,FN4,S42,DX,AX,BX,CX)
+       BODY(14*4,0xab9423a7,FN4,S43,CX,DX,AX,BX)
+       BODY( 5*4,0xfc93a039,FN4,S44,BX,CX,DX,AX)
+
+       BODY(12*4,0x655b59c3,FN4,S41,AX,BX,CX,DX)
+       BODY( 3*4,0x8f0ccc92,FN4,S42,DX,AX,BX,CX)
+       BODY(10*4,0xffeff47d,FN4,S43,CX,DX,AX,BX)
+       BODY( 1*4,0x85845dd1,FN4,S44,BX,CX,DX,AX)
+
+       BODY( 8*4,0x6fa87e4f,FN4,S41,AX,BX,CX,DX)
+       BODY(15*4,0xfe2ce6e0,FN4,S42,DX,AX,BX,CX)
+       BODY( 6*4,0xa3014314,FN4,S43,CX,DX,AX,BX)
+       BODY(13*4,0x4e0811a1,FN4,S44,BX,CX,DX,AX)
+
+       BODY( 4*4,0xf7537e82,FN4,S41,AX,BX,CX,DX)
+       BODY(11*4,0xbd3af235,FN4,S42,DX,AX,BX,CX)
+       BODY( 2*4,0x2ad7d2bb,FN4,S43,CX,DX,AX,BX)
+       BODY( 9*4,0xeb86d391,FN4,S44,BX,CX,DX,AX)
+
+       ADDQ $(16*4),BP
+       MOVQ state+STATE(FP),DI
+       ADDL AX,0(DI)
+       ADDL BX,4(DI)
+       ADDL CX,8(DI)
+       ADDL DX,12(DI)
+
+       CMPQ BP,R8
+       JCS mainloop
+
+       RET
diff --git a/sys/src/libsec/amd64/mkfile b/sys/src/libsec/amd64/mkfile
new file mode 100644 (file)
index 0000000..f7948ca
--- /dev/null
@@ -0,0 +1,19 @@
+objtype=amd64
+</$objtype/mkfile
+
+LIB=/$objtype/lib/libsec.a
+FILES=\
+       md5block\
+       sha1block\
+
+HFILES=/sys/include/libsec.h
+
+SFILES=${FILES:%=%.s}
+
+OFILES=${FILES:%=%.$O}
+
+UPDATE=mkfile\
+       $HFILES\
+       $SFILES\
+
+</sys/src/cmd/mksyslib
diff --git a/sys/src/libsec/amd64/sha1block.s b/sys/src/libsec/amd64/sha1block.s
new file mode 100644 (file)
index 0000000..d8283fb
--- /dev/null
@@ -0,0 +1,197 @@
+/* x = (wp[off-f] ^ wp[off-8] ^ wp[off-14] ^ wp[off-16]) <<< 1;
+ * wp[off] = x;
+ * x += A <<< 5;
+ * E += 0xca62c1d6 + x;
+ * x = FN(B,C,D);
+ * E += x;
+ * B >>> 2
+ */
+#define BSWAPDI        BYTE $0x0f; BYTE $0xcf;
+
+#define BODY(off,FN,V,A,B,C,D,E)\
+       MOVL (off-64)(BP),DI;\
+       XORL (off-56)(BP),DI;\
+       XORL (off-32)(BP),DI;\
+       XORL (off-12)(BP),DI;\
+       ROLL $1,DI;\
+       MOVL DI,off(BP);\
+       LEAL V(DI)(E*1),E;\
+       MOVL A,DI;\
+       ROLL $5,DI;\
+       ADDL DI,E;\
+       FN(B,C,D)\
+       ADDL DI,E;\
+       RORL $2,B;\
+
+#define BODY0(off,FN,V,A,B,C,D,E)\
+       MOVLQZX off(BX),DI;\
+       BSWAPDI;\
+       MOVL DI,off(BP);\
+       LEAL V(DI)(E*1),E;\
+       MOVL A,DI;\
+       ROLL $5,DI;\
+       ADDL DI,E;\
+       FN(B,C,D)\
+       ADDL DI,E;\
+       RORL $2,B;\
+
+/*
+ * fn1 = (((C^D)&B)^D);
+ */
+#define FN1(B,C,D)\
+       MOVL C,DI;\
+       XORL D,DI;\
+       ANDL B,DI;\
+       XORL D,DI;\
+
+/*
+ * fn24 = B ^ C ^ D
+ */
+#define FN24(B,C,D)\
+       MOVL B,DI;\
+       XORL C,DI;\
+       XORL D,DI;\
+
+/*
+ * fn3 = ((B ^ C) & (D ^= B)) ^ B
+ * D ^= B to restore D
+ */
+#define FN3(B,C,D)\
+       MOVL B,DI;\
+       XORL C,DI;\
+       XORL B,D;\
+       ANDL D,DI;\
+       XORL B,DI;\
+       XORL B,D;\
+
+/*
+ * stack offsets
+ * void sha1block(uchar *DATA, int LEN, ulong *STATE)
+ */
+#define        DATA    0
+#define        LEN     8
+#define        STATE   16
+
+/*
+ * stack offsets for locals
+ * ulong w[80];
+ * uchar *edata;
+ * ulong *w15, *w40, *w60, *w80;
+ * register local
+ * ulong *wp = BP
+ * ulong a = eax, b = ebx, c = ecx, d = edx, e = esi
+ * ulong tmp = edi
+ */
+#define        Rpdata  R8
+#define WARRAY (-8-(80*4))
+#define TMP1   (-16-(80*4))
+#define TMP2   (-24-(80*4))
+#define W15    (-32-(80*4))
+#define W40    (-40-(80*4))
+#define W60    (-48-(80*4))
+#define W80    (-56-(80*4))
+#define EDATA  (-64-(80*4))
+
+TEXT   _sha1block+0(SB),$384
+
+       MOVQ RARG, Rpdata
+       MOVLQZX len+LEN(FP),BX
+       ADDQ BX, RARG
+       MOVQ RARG,edata+EDATA(SP)
+
+       LEAQ aw15+(WARRAY+15*4)(SP),DI
+       MOVQ DI,w15+W15(SP)
+       LEAQ aw40+(WARRAY+40*4)(SP),DX
+       MOVQ DX,w40+W40(SP)
+       LEAQ aw60+(WARRAY+60*4)(SP),CX
+       MOVQ CX,w60+W60(SP)
+       LEAQ aw80+(WARRAY+80*4)(SP),DI
+       MOVQ DI,w80+W80(SP)
+
+mainloop:
+       LEAQ warray+WARRAY(SP),BP
+
+       MOVQ state+STATE(FP),DI
+       MOVL (DI),AX
+       MOVL 4(DI),BX
+       MOVL BX,tmp1+TMP1(SP)
+       MOVL 8(DI),CX
+       MOVL 12(DI),DX
+       MOVL 16(DI),SI
+
+       MOVQ Rpdata,BX
+
+loop1:
+       BODY0(0,FN1,0x5a827999,AX,tmp1+TMP1(SP),CX,DX,SI)
+       MOVL SI,tmp2+TMP2(SP)
+       BODY0(4,FN1,0x5a827999,SI,AX,tmp1+TMP1(SP),CX,DX)
+       MOVL tmp1+TMP1(SP),SI
+       BODY0(8,FN1,0x5a827999,DX,tmp2+TMP2(SP),AX,SI,CX)
+       BODY0(12,FN1,0x5a827999,CX,DX,tmp2+TMP2(SP),AX,SI)
+       MOVL SI,tmp1+TMP1(SP)
+       BODY0(16,FN1,0x5a827999,SI,CX,DX,tmp2+TMP2(SP),AX)
+       MOVL tmp2+TMP2(SP),SI
+
+       ADDQ $20,BX
+       ADDQ $20,BP
+       CMPQ BP,w15+W15(SP)
+       JCS loop1
+
+       BODY0(0,FN1,0x5a827999,AX,tmp1+TMP1(SP),CX,DX,SI)
+       ADDQ $4,BX
+       MOVQ BX,R8
+       MOVQ tmp1+TMP1(SP),BX
+
+       BODY(4,FN1,0x5a827999,SI,AX,BX,CX,DX)
+       BODY(8,FN1,0x5a827999,DX,SI,AX,BX,CX)
+       BODY(12,FN1,0x5a827999,CX,DX,SI,AX,BX)
+       BODY(16,FN1,0x5a827999,BX,CX,DX,SI,AX)
+
+       ADDQ $20,BP
+
+loop2:
+       BODY(0,FN24,0x6ed9eba1,AX,BX,CX,DX,SI)
+       BODY(4,FN24,0x6ed9eba1,SI,AX,BX,CX,DX)
+       BODY(8,FN24,0x6ed9eba1,DX,SI,AX,BX,CX)
+       BODY(12,FN24,0x6ed9eba1,CX,DX,SI,AX,BX)
+       BODY(16,FN24,0x6ed9eba1,BX,CX,DX,SI,AX)
+
+       ADDQ $20,BP
+       CMPQ BP,w40+W40(SP)
+       JCS loop2
+
+loop3:
+       BODY(0,FN3,0x8f1bbcdc,AX,BX,CX,DX,SI)
+       BODY(4,FN3,0x8f1bbcdc,SI,AX,BX,CX,DX)
+       BODY(8,FN3,0x8f1bbcdc,DX,SI,AX,BX,CX)
+       BODY(12,FN3,0x8f1bbcdc,CX,DX,SI,AX,BX)
+       BODY(16,FN3,0x8f1bbcdc,BX,CX,DX,SI,AX)
+
+       ADDQ $20,BP
+       CMPQ BP,w60+W60(SP)
+       JCS loop3
+
+loop4:
+       BODY(0,FN24,0xca62c1d6,AX,BX,CX,DX,SI)
+       BODY(4,FN24,0xca62c1d6,SI,AX,BX,CX,DX)
+       BODY(8,FN24,0xca62c1d6,DX,SI,AX,BX,CX)
+       BODY(12,FN24,0xca62c1d6,CX,DX,SI,AX,BX)
+       BODY(16,FN24,0xca62c1d6,BX,CX,DX,SI,AX)
+
+       ADDQ $20,BP
+       CMPQ BP,w80+W80(SP)
+       JCS loop4
+
+       MOVQ state+STATE(FP),DI
+       ADDL AX,0(DI)
+       ADDL BX,4(DI)
+       ADDL CX,8(DI)
+       ADDL DX,12(DI)
+       ADDL SI,16(DI)
+
+       MOVQ edata+EDATA(SP),DI
+       CMPQ Rpdata,DI
+       JCS mainloop
+
+       RET
+       END