]> git.lizzy.rs Git - plan9front.git/blob - sys/src/libmemdraw/draw.c
wifi: learn target ip address from neighbor advertisements in dmat proxy
[plan9front.git] / sys / src / libmemdraw / draw.c
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <memdraw.h>
5 #include <pool.h>
6
7 extern Pool* imagmem;
8
/* perfect approximation to NTSC = .299r+.587g+.114b when 0 ≤ r,g,b < 256 */
#define RGB2K(r,g,b)	((156763*(r)+307758*(g)+59769*(b))>>19)

/*
 * For 16-bit values, x / 255 == (t = x+1, (t+(t>>8)) >> 8).
 * We add another 127 to round to the nearest value rather
 * than truncate.
 *
 * CALCxy does x bytewise calculations on y input images (x=1,4; y=1,2).
 * CALC2x does two parallel 16-bit calculations on y input images (y=1,2).
 *
 * Each macro is an expression.  The tmp arguments are scratch lvalues
 * supplied by the caller and are overwritten.  Every argument is
 * parenthesized in the expansion, so compound expressions such as
 * x|y may be passed.  CALC41 and CALC42 expand their alpha and pixel
 * arguments twice, so those arguments must not have side effects.
 */
#define CALC11(a, v, tmp) \
	((tmp)=(a)*(v)+128, ((tmp)+((tmp)>>8))>>8)

#define CALC12(a1, v1, a2, v2, tmp) \
	((tmp)=(a1)*(v1)+(a2)*(v2)+128, ((tmp)+((tmp)>>8))>>8)

/* selects bytes 0 and 2 of a 32-bit word: the two 16-bit lanes used by CALC2x */
#define MASK 0xFF00FF

#define CALC21(a, vvuu, tmp) \
	((tmp)=(a)*(vvuu)+0x00800080, (((tmp)+(((tmp)>>8)&MASK))>>8)&MASK)

/* bytes 0,2 are scaled in one CALC21, bytes 1,3 in another, then recombined */
#define CALC41(a, rgba, tmp1, tmp2) \
	(CALC21(a, (rgba) & MASK, tmp1) | \
	 (CALC21(a, ((rgba)>>8) & MASK, tmp2)<<8))

#define CALC22(a1, vvuu1, a2, vvuu2, tmp) \
	((tmp)=(a1)*(vvuu1)+(a2)*(vvuu2)+0x00800080, (((tmp)+(((tmp)>>8)&MASK))>>8)&MASK)

#define CALC42(a1, rgba1, a2, rgba2, tmp1, tmp2) \
	(CALC22(a1, (rgba1) & MASK, a2, (rgba2) & MASK, tmp1) | \
	 (CALC22(a1, ((rgba1)>>8) & MASK, a2, ((rgba2)>>8) & MASK, tmp2)<<8))
41
42 static void mktables(void);
43 typedef int Subdraw(Memdrawparam*);
44 static Subdraw chardraw, alphadraw, memoptdraw;
45
46 static Memimage*        memones;
47 static Memimage*        memzeros;
48 Memimage *memwhite;
49 Memimage *memblack;
50 Memimage *memtransparent;
51 Memimage *memopaque;
52
53 int     _ifmt(Fmt*);
54
55 int
56 memimageinit(void)
57 {
58         static int didinit = 0;
59
60         if(didinit)
61                 return 0;
62
63         if(imagmem != nil)
64         if(strcmp(imagmem->name, "Image") == 0 || strcmp(imagmem->name, "image") == 0)
65                 imagmem->move = memimagemove;
66
67         mktables();
68         _memmkcmap();
69
70         fmtinstall('R', Rfmt); 
71         fmtinstall('P', Pfmt);
72         fmtinstall('b', _ifmt);
73
74         memones = allocmemimage(Rect(0,0,1,1), GREY1);
75         memzeros = allocmemimage(Rect(0,0,1,1), GREY1);
76         if(memones == nil || memzeros == nil)
77                 return -1;
78
79         memones->flags |= Frepl;
80         memones->clipr = Rect(-0x3FFFFFF, -0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF);
81         *byteaddr(memones, ZP) = ~0;
82
83         memzeros->flags |= Frepl;
84         memzeros->clipr = Rect(-0x3FFFFFF, -0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF);
85         *byteaddr(memzeros, ZP) = 0;
86
87         memwhite = memones;
88         memblack = memzeros;
89         memopaque = memones;
90         memtransparent = memzeros;
91
92         didinit = 1;
93         return 0;
94 }
95
96 static ulong imgtorgba(Memimage*, ulong);
97 static ulong rgbatoimg(Memimage*, ulong);
98 static ulong pixelbits(Memimage*, Point);
99
100 void
101 memimagedraw(Memimage *dst, Rectangle r, Memimage *src, Point p0, Memimage *mask, Point p1, int op)
102 {
103         Memdrawparam par;
104
105         if(mask == nil)
106                 mask = memopaque;
107
108         if(drawclip(dst, &r, src, &p0, mask, &p1, &par.sr, &par.mr) == 0)
109                 return;
110
111         if(op < Clear || op > SoverD)
112                 return;
113
114         par.op = op;
115         par.dst = dst;
116         par.r = r;
117         par.src = src;
118         /* par.sr set by drawclip */
119         par.mask = mask;
120         /* par.mr set by drawclip */
121
122         par.state = 0;
123         if(src->flags&Frepl){
124                 par.state |= Replsrc;
125                 if(Dx(src->r)==1 && Dy(src->r)==1){
126                         par.sval = pixelbits(src, src->r.min);
127                         par.state |= Simplesrc;
128                         par.srgba = imgtorgba(src, par.sval);
129                         par.sdval = rgbatoimg(dst, par.srgba);
130                         if((par.srgba&0xFF) == 0 && (op&DoutS))
131                                 return; /* no-op successfully handled */
132                 }
133         }
134
135         if(mask->flags & Frepl){
136                 par.state |= Replmask;
137                 if(Dx(mask->r)==1 && Dy(mask->r)==1){
138                         par.mval = pixelbits(mask, mask->r.min);
139                         if(par.mval == 0 && (op&DoutS))
140                                 return; /* no-op successfully handled */
141                         par.state |= Simplemask;
142                         if(par.mval == ~0)
143                                 par.state |= Fullmask;
144                         par.mrgba = imgtorgba(mask, par.mval);
145                 }
146         }
147
148         /*
149          * Now that we've clipped the parameters down to be consistent, we 
150          * simply try sub-drawing routines in order until we find one that was able
151          * to handle us.  If the sub-drawing routine returns zero, it means it was
152          * unable to satisfy the request, so we do not return.
153          */
154
155         /*
156          * Hardware support.  Each video driver provides this function,
157          * which checks to see if there is anything it can help with.
158          * There could be an if around this checking to see if dst is in video memory.
159          */
160         if(hwdraw(&par))
161                 return;
162
163         /*
164          * Optimizations using memmove and memset.
165          */
166         if(memoptdraw(&par))
167                 return;
168
169         /*
170          * Character drawing.
171          * Solid source color being painted through a boolean mask onto a high res image.
172          */
173         if(chardraw(&par))
174                 return;
175
176         /*
177          * General calculation-laden case that does alpha for each pixel.
178          */
179         alphadraw(&par);
180 }
181
182
183 /*
184  * Clip the destination rectangle further based on the properties of the 
185  * source and mask rectangles.  Once the destination rectangle is properly
186  * clipped, adjust the source and mask rectangles to be the same size.
187  *
188  * Return zero if the final rectangle is null.
189  */
190 int
191 drawclipnorepl(Memimage *dst, Rectangle *r, Memimage *src, Point *p0, Memimage *mask, Point *p1, Rectangle *sr, Rectangle *mr)
192 {
193         Point rmin, delta;
194         int splitcoords;
195         Rectangle omr;
196
197         if(badrect(*r))
198                 return 0;
199         splitcoords = (p0->x!=p1->x) || (p0->y!=p1->y);
200         /* clip to destination */
201         rmin = r->min;
202         if(!rectclip(r, dst->r) || !rectclip(r, dst->clipr))
203                 return 0;
204         /* move mask point */
205         p1->x += r->min.x-rmin.x;
206         p1->y += r->min.y-rmin.y;
207         /* move source point */
208         p0->x += r->min.x-rmin.x;
209         p0->y += r->min.y-rmin.y;
210         /* map destination rectangle into source */
211         sr->min = *p0;
212         sr->max.x = p0->x+Dx(*r);
213         sr->max.y = p0->y+Dy(*r);
214         /* sr is r in source coordinates; clip to source */
215         if(!(src->flags&Frepl) && !rectclip(sr, src->r))
216                 return 0;
217         if(!rectclip(sr, src->clipr))
218                 return 0;
219         /* compute and clip rectangle in mask */
220         if(splitcoords){
221                 /* move mask point with source */
222                 p1->x += sr->min.x-p0->x;
223                 p1->y += sr->min.y-p0->y;
224                 mr->min = *p1;
225                 mr->max.x = p1->x+Dx(*sr);
226                 mr->max.y = p1->y+Dy(*sr);
227                 omr = *mr;
228                 /* mr is now rectangle in mask; clip it */
229                 if(!(mask->flags&Frepl) && !rectclip(mr, mask->r))
230                         return 0;
231                 if(!rectclip(mr, mask->clipr))
232                         return 0;
233                 /* reflect any clips back to source */
234                 sr->min.x += mr->min.x-omr.min.x;
235                 sr->min.y += mr->min.y-omr.min.y;
236                 sr->max.x += mr->max.x-omr.max.x;
237                 sr->max.y += mr->max.y-omr.max.y;
238         }else{
239                 if(!(mask->flags&Frepl) && !rectclip(sr, mask->r))
240                         return 0;
241                 if(!rectclip(sr, mask->clipr))
242                         return 0;
243                 *mr = *sr;
244         }
245         /* move source clipping back to destination */
246         delta.x = r->min.x - p0->x;
247         delta.y = r->min.y - p0->y;
248         r->min.x = sr->min.x + delta.x;
249         r->min.y = sr->min.y + delta.y;
250         r->max.x = sr->max.x + delta.x;
251         r->max.y = sr->max.y + delta.y;
252         *p0 = sr->min;
253         *p1 = mr->min;
254
255         assert(Dx(*sr) == Dx(*mr) && Dx(*mr) == Dx(*r));
256         assert(Dy(*sr) == Dy(*mr) && Dy(*mr) == Dy(*r));
257         assert(ptinrect(r->min, dst->r));
258
259         return 1;
260 }
261
262 /*
263  * like drawclipnorepl() above, but if source or mask is replicated,
264  * move its clipped rectangle so that its minimum point falls within
265  * the repl rectangle.
266  *
267  * Return zero if the final rectangle is null.
268  */
269 int
270 drawclip(Memimage *dst, Rectangle *r, Memimage *src, Point *p0, Memimage *mask, Point *p1, Rectangle *sr, Rectangle *mr)
271 {
272         Point delta;
273
274         if(!drawclipnorepl(dst, r, src, p0, mask, p1, sr, mr))
275                 return 0;
276
277         /* move source rectangle so sr->min is in src->r */
278         if(src->flags&Frepl) {
279                 delta.x = drawreplxy(src->r.min.x, src->r.max.x, sr->min.x) - sr->min.x;
280                 delta.y = drawreplxy(src->r.min.y, src->r.max.y, sr->min.y) - sr->min.y;
281                 sr->min.x += delta.x;
282                 sr->min.y += delta.y;
283                 sr->max.x += delta.x;
284                 sr->max.y += delta.y;
285                 *p0 = sr->min;
286         }
287
288         /* move mask point so it is in mask->r */
289         *p1 = drawrepl(mask->r, *p1);
290         mr->min = *p1;
291         mr->max.x = p1->x+Dx(*sr);
292         mr->max.y = p1->y+Dy(*sr);
293
294         assert(ptinrect(*p0, src->r));
295         assert(ptinrect(*p1, mask->r));
296
297         return 1;
298 }
299
300 /*
301  * Conversion tables.
302  */
303 static uchar replbit[1+8][256];         /* replbit[x][y] is the replication of the x-bit quantity y to 8-bit depth */
304 static uchar conv18[256][8];            /* conv18[x][y] is the yth pixel in the depth-1 pixel x */
305 static uchar conv28[256][4];            /* ... */
306 static uchar conv48[256][2];
307
/*
 * Multipliers that replicate n bits to fill 8, for 1 ≤ n ≤ 8.
 * Each entry is a 16-bit pattern; a 1 bit marks where the bottom
 * (ones) bit of the n-bit value is placed.  Only the top 8 bits of
 * the 16-bit product are actually used.  (The lower 8 bits are needed
 * to get bits in the right place when n is not a divisor of 8.)
 *
 * Should check to see if it's easier to just refer to replmul than
 * use the precomputed values in replbit.  On PCs it may well
 * be; on machines with slow multiply instructions it probably isn't.
 */
static int replmul[1+8] = {
	0,
	0xFFFF,		/* 1111 1111 1111 1111 */
	0x5555,		/* 0101 0101 0101 0101 */
	0x2492,		/* 0010 0100 1001 0010 */
	0x1111,		/* 0001 0001 0001 0001 */
	0x0842,		/* 0000 1000 0100 0010 */
	0x0410,		/* 0000 0100 0001 0000 */
	0x0204,		/* 0000 0010 0000 0100 */
	0x0101,		/* 0000 0001 0000 0001 */
};
336
337 static void
338 mktables(void)
339 {
340         int i, j, mask, sh, small;
341                 
342         /* bit replication up to 8 bits */
343         for(i=0; i<256; i++){
344                 for(j=0; j<=8; j++){    /* j <= 8 [sic] */
345                         small = i & ((1<<j)-1);
346                         replbit[j][i] = (small*replmul[j])>>8;
347                 }
348         }
349
350         /* bit unpacking up to 8 bits, only powers of 2 */
351         for(i=0; i<256; i++){
352                 for(j=0, sh=7, mask=1; j<8; j++, sh--)
353                         conv18[i][j] = replbit[1][(i>>sh)&mask];
354
355                 for(j=0, sh=6, mask=3; j<4; j++, sh-=2)
356                         conv28[i][j] = replbit[2][(i>>sh)&mask];
357
358                 for(j=0, sh=4, mask=15; j<2; j++, sh-=4)
359                         conv48[i][j] = replbit[4][(i>>sh)&mask];
360         }
361 }
362
363 static uchar ones = 0xff;
364
365 /*
366  * General alpha drawing case.  Can handle anything.
367  */
368 typedef struct  Buffer  Buffer;
369 struct Buffer {
370         /* used by most routines */
371         uchar   *red;
372         uchar   *grn;
373         uchar   *blu;
374         uchar   *alpha; /* is &ones when unused, never nil */
375         uchar   *grey;
376         ulong   *rgba;
377         int     delta;  /* number of bytes to add to pointer to get next pixel to the right */
378
379         /* used by boolcalc* for mask data */
380         uchar   *m;             /* ptr to mask data r.min byte; like p->bytermin */
381         int             mskip;  /* no. of left bits to skip in *m */
382         uchar   *bm;            /* ptr to mask data img->r.min byte; like p->bytey0s */
383         int             bmskip; /* no. of left bits to skip in *bm */
384         uchar   *em;            /* ptr to mask data img->r.max.x byte; like p->bytey0e */
385         int             emskip; /* no. of right bits to skip in *em */
386 };
387
388 typedef struct  Param   Param;
389 typedef Buffer  Readfn(Param*, uchar*, int);
390 typedef void    Writefn(Param*, uchar*, Buffer);
391 typedef Buffer  Calcfn(Buffer, Buffer, Buffer, int, int, int);
392
393 enum {
394         MAXBCACHE = 16
395 };
396
397 /* giant rathole to customize functions with */
398 struct Param {
399         Readfn  *replcall;
400         Readfn  *greymaskcall;  
401         Readfn  *convreadcall;
402         Writefn *convwritecall;
403
404         Memimage *img;
405         Rectangle       r;
406         int     dx;     /* of r */
407         int     needbuf;
408         int     convgrey;
409         int     alphaonly;
410
411         uchar   *bytey0s;               /* byteaddr(Pt(img->r.min.x, img->r.min.y)) */
412         uchar   *bytermin;      /* byteaddr(Pt(r.min.x, img->r.min.y)) */
413         uchar   *bytey0e;               /* byteaddr(Pt(img->r.max.x, img->r.min.y)) */
414         int             bwidth;
415
416         int     replcache;      /* if set, cache buffers */
417         Buffer  bcache[MAXBCACHE];
418         ulong   bfilled;
419         uchar   *bufbase;
420         int     bufoff;
421         int     bufdelta;
422
423         int     dir;
424
425         int     convbufoff;
426         uchar   *convbuf;
427         Param   *convdpar;
428         int     convdx;
429 };
430
431 static Readfn   greymaskread, replread, readptr;
432 static Writefn  nullwrite;
433 static Calcfn   alphacalc0, alphacalc14, alphacalc2810, alphacalc3679, alphacalc5, alphacalc11, alphacalcS;
434 static Calcfn   boolcalc14, boolcalc236789, boolcalc1011;
435
436 static Readfn*  readfn(Memimage*);
437 static Readfn*  readalphafn(Memimage*);
438 static Writefn* writefn(Memimage*);
439
440 static Calcfn*  boolcopyfn(Memimage*, Memimage*);
441 static Readfn*  convfn(Memimage*, Param*, Memimage*, Param*, int*);
442
443 static Calcfn *alphacalc[Ncomp] = 
444 {
445         alphacalc0,             /* Clear */
446         alphacalc14,            /* DoutS */
447         alphacalc2810,          /* SoutD */
448         alphacalc3679,          /* DxorS */
449         alphacalc14,            /* DinS */
450         alphacalc5,             /* D */
451         alphacalc3679,          /* DatopS */
452         alphacalc3679,          /* DoverS */
453         alphacalc2810,          /* SinD */
454         alphacalc3679,          /* SatopD */
455         alphacalc2810,          /* S */
456         alphacalc11,            /* SoverD */
457 };
458
459 static Calcfn *boolcalc[Ncomp] =
460 {
461         alphacalc0,             /* Clear */
462         boolcalc14,             /* DoutS */
463         boolcalc236789,         /* SoutD */
464         boolcalc236789,         /* DxorS */
465         boolcalc14,             /* DinS */
466         alphacalc5,             /* D */
467         boolcalc236789,         /* DatopS */
468         boolcalc236789,         /* DoverS */
469         boolcalc236789,         /* SinD */
470         boolcalc236789,         /* SatopD */
471         boolcalc1011,           /* S */
472         boolcalc1011,           /* SoverD */
473 };
474
475 /*
476  * Avoid standard Lock, QLock so that can be used in kernel.
477  */
478 typedef struct Dbuf Dbuf;
479 struct Dbuf
480 {
481         uchar *p;
482         int n;
483         Param spar, mpar, dpar;
484         int inuse;
485 };
486 static Dbuf dbuf[10];
487
488 static Dbuf*
489 allocdbuf(void)
490 {
491         int i;
492
493         for(i=0; i<nelem(dbuf); i++){
494                 if(dbuf[i].inuse)
495                         continue;
496                 if(!_tas(&dbuf[i].inuse))
497                         return &dbuf[i];
498         }
499         return nil;
500 }
501
502 static void
503 getparam(Param *p, Memimage *img, Rectangle r, int convgrey, int needbuf, int *ndrawbuf)
504 {
505         int nbuf;
506
507         memset(p, 0, sizeof *p);
508
509         p->img = img;
510         p->r = r;
511         p->dx = Dx(r);
512         p->needbuf = needbuf;
513         p->convgrey = convgrey;
514
515         assert(img->r.min.x <= r.min.x && r.min.x < img->r.max.x);
516
517         p->bytey0s = byteaddr(img, Pt(img->r.min.x, img->r.min.y));
518         p->bytermin = byteaddr(img, Pt(r.min.x, img->r.min.y));
519         p->bytey0e = byteaddr(img, Pt(img->r.max.x, img->r.min.y));
520         p->bwidth = sizeof(ulong)*img->width;
521
522         assert(p->bytey0s <= p->bytermin && p->bytermin <= p->bytey0e);
523
524         if(p->r.min.x == p->img->r.min.x)
525                 assert(p->bytermin == p->bytey0s);
526
527         nbuf = 1;
528         if((img->flags&Frepl) && Dy(img->r) <= MAXBCACHE && Dy(img->r) < Dy(r)){
529                 p->replcache = 1;
530                 nbuf = Dy(img->r);
531         }
532         p->bufdelta = 4*p->dx;
533         p->bufoff = *ndrawbuf;
534         *ndrawbuf += p->bufdelta*nbuf;
535 }
536
537 static void
538 clipy(Memimage *img, int *y)
539 {
540         int dy;
541
542         dy = Dy(img->r);
543         if(*y == dy)
544                 *y = 0;
545         else if(*y == -1)
546                 *y = dy-1;
547         assert(0 <= *y && *y < dy);
548 }
549
550 /*
551  * For each scan line, we expand the pixels from source, mask, and destination
552  * into byte-aligned red, green, blue, alpha, and grey channels.  If buffering is not
553  * needed and the channels were already byte-aligned (grey8, rgb24, rgba32, rgb32),
554  * the readers need not copy the data: they can simply return pointers to the data.
555  * If the destination image is grey and the source is not, it is converted using the NTSC
556  * formula.
557  *
558  * Once we have all the channels, we call either rgbcalc or greycalc, depending on 
559  * whether the destination image is color.  This is allowed to overwrite the dst buffer (perhaps
560  * the actual data, perhaps a copy) with its result.  It should only overwrite the dst buffer
561  * with the same format (i.e. red bytes with red bytes, etc.)  A new buffer is returned from
562  * the calculator, and that buffer is passed to a function to write it to the destination.
563  * If the buffer is already pointing at the destination, the writing function is a no-op.
564  */
565 static int
566 alphadraw(Memdrawparam *par)
567 {
568         int isgrey, starty, endy, op;
569         int needbuf, dsty, srcy, masky;
570         int y, dir, dx, dy, ndrawbuf;
571         uchar *drawbuf;
572         Buffer bsrc, bdst, bmask;
573         Readfn *rdsrc, *rdmask, *rddst;
574         Calcfn *calc;
575         Writefn *wrdst;
576         Memimage *src, *mask, *dst;
577         Rectangle r, sr, mr;
578         Dbuf *z;
579
580         r = par->r;
581         dx = Dx(r);
582         dy = Dy(r);
583
584         z = allocdbuf();
585         if(z == nil)
586                 return 0;
587
588         src = par->src;
589         mask = par->mask;       
590         dst = par->dst;
591         sr = par->sr;
592         mr = par->mr;
593         op = par->op;
594
595         isgrey = dst->flags&Fgrey;
596
597         /*
598          * Buffering when src and dst are the same bitmap is sufficient but not 
599          * necessary.  There are stronger conditions we could use.  We could
600          * check to see if the rectangles intersect, and if simply moving in the
601          * correct y direction can avoid the need to buffer.
602          */
603         needbuf = (src->data == dst->data);
604
605         ndrawbuf = 0;
606         getparam(&z->spar, src, sr, isgrey, needbuf, &ndrawbuf);
607         getparam(&z->dpar, dst, r, isgrey, needbuf, &ndrawbuf);
608         getparam(&z->mpar, mask, mr, 0, needbuf, &ndrawbuf);
609
610         dir = (needbuf && byteaddr(dst, r.min) > byteaddr(src, sr.min)) ? -1 : 1;
611         z->spar.dir = z->mpar.dir = z->dpar.dir = dir;
612
613         /*
614          * If the mask is purely boolean, we can convert from src to dst format
615          * when we read src, and then just copy it to dst where the mask tells us to.
616          * This requires a boolean (1-bit grey) mask and lack of a source alpha channel.
617          *
618          * The computation is accomplished by assigning the function pointers as follows:
619          *      rdsrc - read and convert source into dst format in a buffer
620          *      rdmask - convert mask to bytes, set pointer to it
621          *      rddst - fill with pointer to real dst data, but do no reads
622          *      calc - copy src onto dst when mask says to.
623          *      wrdst - do nothing
624          * This is slightly sleazy, since things aren't doing exactly what their names say,
625          * but it avoids a fair amount of code duplication to make this a case here
626          * rather than have a separate booldraw.
627          */
628         if(!(src->flags&Falpha) && mask->chan == GREY1 && dst->depth >= 8 && op == SoverD){
629                 rdsrc = convfn(dst, &z->dpar, src, &z->spar, &ndrawbuf);
630                 rddst = readptr;
631                 rdmask = readfn(mask);
632                 calc = boolcopyfn(dst, mask);
633                 wrdst = nullwrite;
634         }else{
635                 /* usual alphadraw parameter fetching */
636                 rdsrc = readfn(src);
637                 rddst = readfn(dst);
638                 wrdst = writefn(dst);
639                 calc = alphacalc[op];
640
641                 /*
642                  * If there is no alpha channel, we'll ask for a grey channel
643                  * and pretend it is the alpha.
644                  */
645                 if(mask->flags&Falpha){
646                         rdmask = readalphafn(mask);
647                         z->mpar.alphaonly = 1;
648                 }else{
649                         z->mpar.greymaskcall = readfn(mask);
650                         z->mpar.convgrey = 1;
651                         rdmask = greymaskread;
652
653                         /*
654                          * Should really be above, but then boolcopyfns would have
655                          * to deal with bit alignment, and I haven't written that.
656                          *
657                          * This is a common case for things like ellipse drawing.
658                          * When there's no alpha involved and the mask is boolean,
659                          * we can avoid all the division and multiplication.
660                          */
661                         if(mask->chan == GREY1 && !(src->flags&Falpha))
662                                 calc = boolcalc[op];
663                         else if(op == SoverD && !(src->flags&Falpha))
664                                 calc = alphacalcS;
665                 }
666         }
667
668         /*
669          * If the image has a small enough repl rectangle,
670          * we can just read each line once and cache them.
671          */
672         if(z->spar.replcache){
673                 z->spar.replcall = rdsrc;
674                 rdsrc = replread;
675         }
676         if(z->mpar.replcache){
677                 z->mpar.replcall = rdmask;
678                 rdmask = replread;
679         }
680
681         if(z->n < ndrawbuf){
682                 free(z->p);
683                 if((z->p = mallocz(ndrawbuf, 0)) == nil){
684                         z->inuse = 0;
685                         return 0;
686                 }
687                 z->n = ndrawbuf;
688         }
689         drawbuf = z->p;
690
691         /*
692          * Before we were saving only offsets from drawbuf in the parameter
693          * structures; now that drawbuf has been grown to accomodate us,
694          * we can fill in the pointers.
695          */
696         z->spar.bufbase = drawbuf+z->spar.bufoff;
697         z->mpar.bufbase = drawbuf+z->mpar.bufoff;
698         z->dpar.bufbase = drawbuf+z->dpar.bufoff;
699         z->spar.convbuf = drawbuf+z->spar.convbufoff;
700
701         if(dir == 1){
702                 starty = 0;
703                 endy = dy;
704         }else{
705                 starty = dy-1;
706                 endy = -1;
707         }
708
709         /*
710          * srcy, masky, and dsty are offsets from the top of their
711          * respective Rectangles.  they need to be contained within
712          * the rectangles, so clipy can keep them there without division.
713          */
714         srcy = (starty + sr.min.y - src->r.min.y)%Dy(src->r);
715         masky = (starty + mr.min.y - mask->r.min.y)%Dy(mask->r);
716         dsty = starty + r.min.y - dst->r.min.y;
717
718         assert(0 <= srcy && srcy < Dy(src->r));
719         assert(0 <= masky && masky < Dy(mask->r));
720         assert(0 <= dsty && dsty < Dy(dst->r));
721
722         for(y=starty; y!=endy; y+=dir, srcy+=dir, masky+=dir, dsty+=dir){
723                 clipy(src, &srcy);
724                 clipy(dst, &dsty);
725                 clipy(mask, &masky);
726
727                 bsrc = rdsrc(&z->spar, z->spar.bufbase, srcy);
728                 bmask = rdmask(&z->mpar, z->mpar.bufbase, masky);
729                 bdst = rddst(&z->dpar, z->dpar.bufbase, dsty);
730                 bdst = calc(bdst, bsrc, bmask, dx, isgrey, op);
731                 wrdst(&z->dpar, z->dpar.bytermin+dsty*z->dpar.bwidth, bdst);
732         }
733
734         z->inuse = 0;
735         return 1;
736 }
737
738 static Buffer
739 alphacalc0(Buffer bdst, Buffer b1, Buffer b2, int dx, int grey, int op)
740 {
741         USED(grey);
742         USED(op);
743         USED(b1);
744         USED(b2);
745         memset(bdst.rgba, 0, dx*bdst.delta);
746         return bdst;
747 }
748
749 /*
750  * Do the channels in the buffers match enough
751  * that we can do word-at-a-time operations
752  * on the pixels?
753  */
754 static int
755 chanmatch(Buffer *bdst, Buffer *bsrc)
756 {
757         uchar *drgb, *srgb;
758         
759         /*
760          * first, r, g, b must be in the same place
761          * in the rgba word.
762          */
763         drgb = (uchar*)bdst->rgba;
764         srgb = (uchar*)bsrc->rgba;
765         if(bdst->red - drgb != bsrc->red - srgb
766         || bdst->blu - drgb != bsrc->blu - srgb
767         || bdst->grn - drgb != bsrc->grn - srgb)
768                 return 0;
769         
770         /*
771          * that implies alpha is in the same place,
772          * if it is there at all (it might be == &ones).
773          * if the destination is &ones, we can scribble
774          * over the rgba slot just fine.
775          */
776         if(bdst->alpha == &ones)
777                 return 1;
778         
779         /*
780          * if the destination is not ones but the src is,
781          * then the simultaneous calculation will use
782          * bogus bytes from the src's rgba.  no good.
783          */
784         if(bsrc->alpha == &ones)
785                 return 0;
786         
787         /*
788          * otherwise, alphas are in the same place.
789          */
790         return 1;
791 }
792
793 static Buffer
794 alphacalc14(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
795 {
796         Buffer obdst;
797         int fd, sadelta;
798         int i, sa, ma, q;
799         ulong t, t1;
800
801         obdst = bdst;
802         sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
803         q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);
804
805         for(i=0; i<dx; i++){
806                 sa = *bsrc.alpha;
807                 ma = *bmask.alpha;
808                 fd = CALC11(sa, ma, t);
809                 if(op == DoutS)
810                         fd = 255-fd;
811
812                 if(grey){
813                         *bdst.grey = CALC11(fd, *bdst.grey, t);
814                         bsrc.grey += bsrc.delta;
815                         bdst.grey += bdst.delta;
816                 }else{
817                         if(q){
818                                 *bdst.rgba = CALC41(fd, *bdst.rgba, t, t1);
819                                 bsrc.rgba++;
820                                 bdst.rgba++;
821                                 bsrc.alpha += sadelta;
822                                 bmask.alpha += bmask.delta;
823                                 continue;
824                         }
825                         *bdst.red = CALC11(fd, *bdst.red, t);
826                         *bdst.grn = CALC11(fd, *bdst.grn, t);
827                         *bdst.blu = CALC11(fd, *bdst.blu, t);
828                         bsrc.red += bsrc.delta;
829                         bsrc.blu += bsrc.delta;
830                         bsrc.grn += bsrc.delta;
831                         bdst.red += bdst.delta;
832                         bdst.blu += bdst.delta;
833                         bdst.grn += bdst.delta;
834                 }
835                 if(bdst.alpha != &ones){
836                         *bdst.alpha = CALC11(fd, *bdst.alpha, t);
837                         bdst.alpha += bdst.delta;
838                 }
839                 bmask.alpha += bmask.delta;
840                 bsrc.alpha += sadelta;
841         }
842         return obdst;
843 }
844
845 static Buffer
846 alphacalc2810(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
847 {
848         Buffer obdst;
849         int fs, sadelta;
850         int i, ma, da, q;
851         ulong t, t1;
852
853         obdst = bdst;
854         sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
855         q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);
856
857         for(i=0; i<dx; i++){
858                 ma = *bmask.alpha;
859                 da = *bdst.alpha;
860                 if(op == SoutD)
861                         da = 255-da;
862                 fs = ma;
863                 if(op != S)
864                         fs = CALC11(fs, da, t);
865
866                 if(grey){
867                         *bdst.grey = CALC11(fs, *bsrc.grey, t);
868                         bsrc.grey += bsrc.delta;
869                         bdst.grey += bdst.delta;
870                 }else{
871                         if(q){
872                                 *bdst.rgba = CALC41(fs, *bsrc.rgba, t, t1);
873                                 bsrc.rgba++;
874                                 bdst.rgba++;
875                                 bmask.alpha += bmask.delta;
876                                 bdst.alpha += bdst.delta;
877                                 continue;
878                         }
879                         *bdst.red = CALC11(fs, *bsrc.red, t);
880                         *bdst.grn = CALC11(fs, *bsrc.grn, t);
881                         *bdst.blu = CALC11(fs, *bsrc.blu, t);
882                         bsrc.red += bsrc.delta;
883                         bsrc.blu += bsrc.delta;
884                         bsrc.grn += bsrc.delta;
885                         bdst.red += bdst.delta;
886                         bdst.blu += bdst.delta;
887                         bdst.grn += bdst.delta;
888                 }
889                 if(bdst.alpha != &ones){
890                         *bdst.alpha = CALC11(fs, *bsrc.alpha, t);
891                         bdst.alpha += bdst.delta;
892                 }
893                 bmask.alpha += bmask.delta;
894                 bsrc.alpha += sadelta;
895         }
896         return obdst;
897 }
898
899 static Buffer
900 alphacalc3679(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
901 {
902         Buffer obdst;
903         int fs, fd, sadelta;
904         int i, sa, ma, da, q;
905         ulong t, t1;
906
907         obdst = bdst;
908         sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
909         q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);
910
911         for(i=0; i<dx; i++){
912                 sa = *bsrc.alpha;
913                 ma = *bmask.alpha;
914                 da = *bdst.alpha;
915                 if(op == SatopD)
916                         fs = CALC11(ma, da, t);
917                 else
918                         fs = CALC11(ma, 255-da, t);
919                 if(op == DoverS)
920                         fd = 255;
921                 else{
922                         fd = CALC11(sa, ma, t);
923                         if(op != DatopS)
924                                 fd = 255-fd;
925                 }
926
927                 if(grey){
928                         *bdst.grey = CALC12(fs, *bsrc.grey, fd, *bdst.grey, t);
929                         bsrc.grey += bsrc.delta;
930                         bdst.grey += bdst.delta;
931                 }else{
932                         if(q){
933                                 *bdst.rgba = CALC42(fs, *bsrc.rgba, fd, *bdst.rgba, t, t1);
934                                 bsrc.rgba++;
935                                 bdst.rgba++;
936                                 bsrc.alpha += sadelta;
937                                 bmask.alpha += bmask.delta;
938                                 bdst.alpha += bdst.delta;
939                                 continue;
940                         }
941                         *bdst.red = CALC12(fs, *bsrc.red, fd, *bdst.red, t);
942                         *bdst.grn = CALC12(fs, *bsrc.grn, fd, *bdst.grn, t);
943                         *bdst.blu = CALC12(fs, *bsrc.blu, fd, *bdst.blu, t);
944                         bsrc.red += bsrc.delta;
945                         bsrc.blu += bsrc.delta;
946                         bsrc.grn += bsrc.delta;
947                         bdst.red += bdst.delta;
948                         bdst.blu += bdst.delta;
949                         bdst.grn += bdst.delta;
950                 }
951                 if(bdst.alpha != &ones){
952                         *bdst.alpha = CALC12(fs, sa, fd, da, t);
953                         bdst.alpha += bdst.delta;
954                 }
955                 bmask.alpha += bmask.delta;
956                 bsrc.alpha += sadelta;
957         }
958         return obdst;
959 }
960
961 static Buffer
962 alphacalc5(Buffer bdst, Buffer b1, Buffer b2, int dx, int grey, int op)
963 {
964         USED(dx);
965         USED(grey);
966         USED(op);
967         USED(b1);
968         USED(b2);
969         return bdst;
970 }
971
972 static Buffer
973 alphacalc11(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
974 {
975         Buffer obdst;
976         int fd, sadelta;
977         int i, sa, ma, q;
978         ulong t, t1;
979
980         USED(op);
981         obdst = bdst;
982         sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
983         q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);
984
985         for(i=0; i<dx; i++){
986                 sa = *bsrc.alpha;
987                 ma = *bmask.alpha;
988                 fd = 255-CALC11(sa, ma, t);
989
990                 if(grey){
991                         *bdst.grey = CALC12(ma, *bsrc.grey, fd, *bdst.grey, t);
992                         bsrc.grey += bsrc.delta;
993                         bdst.grey += bdst.delta;
994                 }else{
995                         if(q){
996                                 *bdst.rgba = CALC42(ma, *bsrc.rgba, fd, *bdst.rgba, t, t1);
997                                 bsrc.rgba++;
998                                 bdst.rgba++;
999                                 bsrc.alpha += sadelta;
1000                                 bmask.alpha += bmask.delta;
1001                                 continue;
1002                         }
1003                         *bdst.red = CALC12(ma, *bsrc.red, fd, *bdst.red, t);
1004                         *bdst.grn = CALC12(ma, *bsrc.grn, fd, *bdst.grn, t);
1005                         *bdst.blu = CALC12(ma, *bsrc.blu, fd, *bdst.blu, t);
1006                         bsrc.red += bsrc.delta;
1007                         bsrc.blu += bsrc.delta;
1008                         bsrc.grn += bsrc.delta;
1009                         bdst.red += bdst.delta;
1010                         bdst.blu += bdst.delta;
1011                         bdst.grn += bdst.delta;
1012                 }
1013                 if(bdst.alpha != &ones){
1014                         *bdst.alpha = CALC12(ma, sa, fd, *bdst.alpha, t);
1015                         bdst.alpha += bdst.delta;
1016                 }
1017                 bmask.alpha += bmask.delta;
1018                 bsrc.alpha += sadelta;
1019         }
1020         return obdst;
1021 }
1022
1023 /*
1024 not used yet
1025 source and mask alpha 1
1026 static Buffer
1027 alphacalcS0(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1028 {
1029         Buffer obdst;
1030         int i;
1031
1032         USED(op);
1033         obdst = bdst;
1034         if(bsrc.delta == bdst.delta){
1035                 memmove(bdst.rgba, bsrc.rgba, dx*bdst.delta);
1036                 return obdst;
1037         }
1038         for(i=0; i<dx; i++){
1039                 if(grey){
1040                         *bdst.grey = *bsrc.grey;
1041                         bsrc.grey += bsrc.delta;
1042                         bdst.grey += bdst.delta;
1043                 }else{
1044                         *bdst.red = *bsrc.red;
1045                         *bdst.grn = *bsrc.grn;
1046                         *bdst.blu = *bsrc.blu;
1047                         bsrc.red += bsrc.delta;
1048                         bsrc.blu += bsrc.delta;
1049                         bsrc.grn += bsrc.delta;
1050                         bdst.red += bdst.delta;
1051                         bdst.blu += bdst.delta;
1052                         bdst.grn += bdst.delta;
1053                 }
1054                 if(bdst.alpha != &ones){
1055                         *bdst.alpha = 255;
1056                         bdst.alpha += bdst.delta;
1057                 }
1058         }
1059         return obdst;
1060 }
1061 */
1062
1063 /* source alpha 1 */
1064 static Buffer
1065 alphacalcS(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1066 {
1067         Buffer obdst;
1068         int fd;
1069         int i, ma;
1070         ulong t;
1071
1072         USED(op);
1073         obdst = bdst;
1074
1075         for(i=0; i<dx; i++){
1076                 ma = *bmask.alpha;
1077                 fd = 255-ma;
1078
1079                 if(grey){
1080                         *bdst.grey = CALC12(ma, *bsrc.grey, fd, *bdst.grey, t);
1081                         bsrc.grey += bsrc.delta;
1082                         bdst.grey += bdst.delta;
1083                 }else{
1084                         *bdst.red = CALC12(ma, *bsrc.red, fd, *bdst.red, t);
1085                         *bdst.grn = CALC12(ma, *bsrc.grn, fd, *bdst.grn, t);
1086                         *bdst.blu = CALC12(ma, *bsrc.blu, fd, *bdst.blu, t);
1087                         bsrc.red += bsrc.delta;
1088                         bsrc.blu += bsrc.delta;
1089                         bsrc.grn += bsrc.delta;
1090                         bdst.red += bdst.delta;
1091                         bdst.blu += bdst.delta;
1092                         bdst.grn += bdst.delta;
1093                 }
1094                 if(bdst.alpha != &ones){
1095                         *bdst.alpha = ma+CALC11(fd, *bdst.alpha, t);
1096                         bdst.alpha += bdst.delta;
1097                 }
1098                 bmask.alpha += bmask.delta;
1099         }
1100         return obdst;
1101 }
1102
1103 static Buffer
1104 boolcalc14(Buffer bdst, Buffer b1, Buffer bmask, int dx, int grey, int op)
1105 {
1106         Buffer obdst;
1107         int i, ma, zero;
1108
1109         USED(b1);
1110
1111         obdst = bdst;
1112
1113         for(i=0; i<dx; i++){
1114                 ma = *bmask.alpha;
1115                 zero = ma ? op == DoutS : op == DinS;
1116
1117                 if(grey){
1118                         if(zero)
1119                                 *bdst.grey = 0;
1120                         bdst.grey += bdst.delta;
1121                 }else{
1122                         if(zero)
1123                                 *bdst.red = *bdst.grn = *bdst.blu = 0;
1124                         bdst.red += bdst.delta;
1125                         bdst.blu += bdst.delta;
1126                         bdst.grn += bdst.delta;
1127                 }
1128                 bmask.alpha += bmask.delta;
1129                 if(bdst.alpha != &ones){
1130                         if(zero)
1131                                 *bdst.alpha = 0;
1132                         bdst.alpha += bdst.delta;
1133                 }
1134         }
1135         return obdst;
1136 }
1137
1138 static Buffer
1139 boolcalc236789(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1140 {
1141         Buffer obdst;
1142         int fs, fd;
1143         int i, ma, da, zero;
1144         ulong t;
1145
1146         obdst = bdst;
1147         zero = !(op&1);
1148
1149         for(i=0; i<dx; i++){
1150                 ma = *bmask.alpha;
1151                 da = *bdst.alpha;
1152                 fs = da;
1153                 if(op&2)
1154                         fs = 255-da;
1155                 fd = 0;
1156                 if(op&4)
1157                         fd = 255;
1158
1159                 if(grey){
1160                         if(ma)
1161                                 *bdst.grey = CALC12(fs, *bsrc.grey, fd, *bdst.grey, t);
1162                         else if(zero)
1163                                 *bdst.grey = 0;
1164                         bsrc.grey += bsrc.delta;
1165                         bdst.grey += bdst.delta;
1166                 }else{
1167                         if(ma){
1168                                 *bdst.red = CALC12(fs, *bsrc.red, fd, *bdst.red, t);
1169                                 *bdst.grn = CALC12(fs, *bsrc.grn, fd, *bdst.grn, t);
1170                                 *bdst.blu = CALC12(fs, *bsrc.blu, fd, *bdst.blu, t);
1171                         }
1172                         else if(zero)
1173                                 *bdst.red = *bdst.grn = *bdst.blu = 0;
1174                         bsrc.red += bsrc.delta;
1175                         bsrc.blu += bsrc.delta;
1176                         bsrc.grn += bsrc.delta;
1177                         bdst.red += bdst.delta;
1178                         bdst.blu += bdst.delta;
1179                         bdst.grn += bdst.delta;
1180                 }
1181                 bmask.alpha += bmask.delta;
1182                 if(bdst.alpha != &ones){
1183                         if(ma)
1184                                 *bdst.alpha = fs+CALC11(fd, da, t);
1185                         else if(zero)
1186                                 *bdst.alpha = 0;
1187                         bdst.alpha += bdst.delta;
1188                 }
1189         }
1190         return obdst;
1191 }
1192
1193 static Buffer
1194 boolcalc1011(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1195 {
1196         Buffer obdst;
1197         int i, ma, zero;
1198
1199         obdst = bdst;
1200         zero = !(op&1);
1201
1202         for(i=0; i<dx; i++){
1203                 ma = *bmask.alpha;
1204
1205                 if(grey){
1206                         if(ma)
1207                                 *bdst.grey = *bsrc.grey;
1208                         else if(zero)
1209                                 *bdst.grey = 0;
1210                         bsrc.grey += bsrc.delta;
1211                         bdst.grey += bdst.delta;
1212                 }else{
1213                         if(ma){
1214                                 *bdst.red = *bsrc.red;
1215                                 *bdst.grn = *bsrc.grn;
1216                                 *bdst.blu = *bsrc.blu;
1217                         }
1218                         else if(zero)
1219                                 *bdst.red = *bdst.grn = *bdst.blu = 0;
1220                         bsrc.red += bsrc.delta;
1221                         bsrc.blu += bsrc.delta;
1222                         bsrc.grn += bsrc.delta;
1223                         bdst.red += bdst.delta;
1224                         bdst.blu += bdst.delta;
1225                         bdst.grn += bdst.delta;
1226                 }
1227                 bmask.alpha += bmask.delta;
1228                 if(bdst.alpha != &ones){
1229                         if(ma)
1230                                 *bdst.alpha = 255;
1231                         else if(zero)
1232                                 *bdst.alpha = 0;
1233                         bdst.alpha += bdst.delta;
1234                 }
1235         }
1236         return obdst;
1237 }
1238 /*
1239  * Replicated cached scan line read.  Call the function listed in the Param,
1240  * but cache the result so that for replicated images we only do the work once.
1241  */
1242 static Buffer
1243 replread(Param *p, uchar *s, int y)
1244 {
1245         Buffer *b;
1246
1247         USED(s);
1248         b = &p->bcache[y];
1249         if((p->bfilled & (1<<y)) == 0){
1250                 p->bfilled |= 1<<y;
1251                 *b = p->replcall(p, p->bufbase+y*p->bufdelta, y);
1252         }
1253         return *b;
1254 }
1255
1256 /*
1257  * Alpha reading function that simply relabels the grey pointer.
1258  */
1259 static Buffer
1260 greymaskread(Param *p, uchar *buf, int y)
1261 {
1262         Buffer b;
1263
1264         b = p->greymaskcall(p, buf, y);
1265         b.alpha = b.grey;
1266         return b;
1267 }
1268
1269 static Buffer
1270 readnbit(Param *p, uchar *buf, int y)
1271 {
1272         Buffer b;
1273         Memimage *img;
1274         uchar *repl, *r, *w, *ow, bits;
1275         int i, n, sh, depth, x, dx, npack, nbits;
1276
1277         b.rgba = (ulong*)buf;
1278         b.grey = w = buf;
1279         b.red = b.blu = b.grn = w;
1280         b.alpha = &ones;
1281         b.delta = 1;
1282
1283         dx = p->dx;
1284         img = p->img;
1285         depth = img->depth;
1286         repl = &replbit[depth][0];
1287         npack = 8/depth;
1288         sh = 8-depth;
1289
1290         /* copy from p->r.min.x until end of repl rectangle */
1291         x = p->r.min.x;
1292         n = dx;
1293         if(n > p->img->r.max.x - x)
1294                 n = p->img->r.max.x - x;
1295
1296         r = p->bytermin + y*p->bwidth;
1297         bits = *r++;
1298         nbits = 8;
1299         if(i=x&(npack-1)){
1300                 bits <<= depth*i;
1301                 nbits -= depth*i;
1302         }
1303         for(i=0; i<n; i++){
1304                 if(nbits == 0){
1305                         bits = *r++;
1306                         nbits = 8;
1307                 }
1308                 *w++ = repl[bits>>sh];
1309                 bits <<= depth;
1310                 nbits -= depth;
1311         }
1312         dx -= n;
1313         if(dx == 0)
1314                 return b;
1315
1316         assert(x+i == p->img->r.max.x);
1317
1318         /* copy from beginning of repl rectangle until where we were before. */
1319         x = p->img->r.min.x;
1320         n = dx;
1321         if(n > p->r.min.x - x)
1322                 n = p->r.min.x - x;
1323
1324         r = p->bytey0s + y*p->bwidth;
1325         bits = *r++;
1326         nbits = 8;
1327         if(i=x&(npack-1)){
1328                 bits <<= depth*i;
1329                 nbits -= depth*i;
1330         }
1331         for(i=0; i<n; i++){
1332                 if(nbits == 0){
1333                         bits = *r++;
1334                         nbits = 8;
1335                 }
1336                 *w++ = repl[bits>>sh];
1337                 bits <<= depth;
1338                 nbits -= depth;
1339         }
1340         dx -= n;
1341         if(dx == 0)
1342                 return b;
1343
1344         assert(dx > 0);
1345         /* now we have exactly one full scan line: just replicate the buffer itself until we are done */
1346         ow = buf;
1347         while(dx--)
1348                 *w++ = *ow++;
1349
1350         return b;
1351 }
1352
1353 static void
1354 writenbit(Param *p, uchar *w, Buffer src)
1355 {
1356         uchar *r;
1357         ulong bits;
1358         int i, sh, depth, npack, nbits, x, ex;
1359
1360         assert(src.grey != nil && src.delta == 1);
1361
1362         x = p->r.min.x;
1363         ex = x+p->dx;
1364         depth = p->img->depth;
1365         npack = 8/depth;
1366
1367         i=x&(npack-1);
1368         bits = i ? (*w >> (8-depth*i)) : 0;
1369         nbits = depth*i;
1370         sh = 8-depth;
1371         r = src.grey;
1372
1373         for(; x<ex; x++){
1374                 bits <<= depth;
1375                 bits |= (*r++ >> sh);
1376                 nbits += depth;
1377                 if(nbits == 8){
1378                         *w++ = bits;
1379                         nbits = 0;
1380                 }
1381         }
1382
1383         if(nbits){
1384                 sh = 8-nbits;
1385                 bits <<= sh;
1386                 bits |= *w & ((1<<sh)-1);
1387                 *w = bits;
1388         }
1389         return;
1390 }
1391
1392 static Buffer
1393 readcmap(Param *p, uchar *buf, int y)
1394 {
1395         Buffer b;
1396         int a, convgrey, copyalpha, dx, i, m;
1397         uchar *q, *cmap, *begin, *end, *r, *w;
1398
1399         begin = p->bytey0s + y*p->bwidth;
1400         r = p->bytermin + y*p->bwidth;
1401         end = p->bytey0e + y*p->bwidth;
1402         cmap = p->img->cmap->cmap2rgb;
1403         convgrey = p->convgrey;
1404         copyalpha = (p->img->flags&Falpha) != 0;
1405
1406         w = buf;
1407         dx = p->dx;
1408         if(copyalpha){
1409                 b.alpha = buf++;
1410                 a = p->img->shift[CAlpha]/8;
1411                 m = p->img->shift[CMap]/8;
1412                 for(i=0; i<dx; i++){
1413                         *w++ = r[a];
1414                         q = cmap+r[m]*3;
1415                         r += 2;
1416                         if(r == end)
1417                                 r = begin;
1418                         if(convgrey){
1419                                 *w++ = RGB2K(q[0], q[1], q[2]);
1420                         }else{
1421                                 *w++ = q[2];    /* blue */
1422                                 *w++ = q[1];    /* green */
1423                                 *w++ = q[0];    /* red */
1424                         }
1425                 }
1426         }else{
1427                 b.alpha = &ones;
1428                 for(i=0; i<dx; i++){
1429                         q = cmap+*r++*3;
1430                         if(r == end)
1431                                 r = begin;
1432                         if(convgrey){
1433                                 *w++ = RGB2K(q[0], q[1], q[2]);
1434                         }else{
1435                                 *w++ = q[2];    /* blue */
1436                                 *w++ = q[1];    /* green */
1437                                 *w++ = q[0];    /* red */
1438                         }
1439                 }
1440         }
1441
1442         b.rgba = (ulong*)(buf-copyalpha);
1443
1444         if(convgrey){
1445                 b.grey = buf;
1446                 b.red = b.blu = b.grn = buf;
1447                 b.delta = 1+copyalpha;
1448         }else{
1449                 b.blu = buf;
1450                 b.grn = buf+1;
1451                 b.red = buf+2;
1452                 b.grey = nil;
1453                 b.delta = 3+copyalpha;
1454         }
1455         return b;
1456 }
1457
1458 static void
1459 writecmap(Param *p, uchar *w, Buffer src)
1460 {
1461         uchar *cmap, *red, *grn, *blu, *alpha;
1462         int i, dx, delta, a, m;
1463
1464         cmap = p->img->cmap->rgb2cmap;
1465         
1466         delta = src.delta;
1467         red= src.red;
1468         grn = src.grn;
1469         blu = src.blu;
1470
1471         dx = p->dx;
1472         if(p->img->flags&Falpha){
1473                 alpha = src.alpha;
1474                 m = p->img->shift[CMap]/8;
1475                 a = p->img->shift[CAlpha]/8;
1476                 for(i=0; i<dx; i++, red+=delta, grn+=delta, blu+=delta, w+=2){
1477                         w[a] = *alpha;
1478                         if(alpha != &ones)
1479                                 alpha+=delta;
1480                         w[m] = cmap[(*red>>4)*256+(*grn>>4)*16+(*blu>>4)];
1481                 }
1482         } else {
1483                 for(i=0; i<dx; i++, red+=delta, grn+=delta, blu+=delta)
1484                         *w++ = cmap[(*red>>4)*256+(*grn>>4)*16+(*blu>>4)];
1485         }
1486 }
1487
1488 static Buffer
1489 readbyte(Param *p, uchar *buf, int y)
1490 {
1491         Buffer b;
1492         Memimage *img;
1493         int dx, isgrey, convgrey, alphaonly, copyalpha, i, nb;
1494         uchar *begin, *end, *r, *w, *rrepl, *grepl, *brepl, *arepl, *krepl;
1495         uchar ured, ugrn, ublu;
1496         ulong u;
1497
1498         img = p->img;
1499         begin = p->bytey0s + y*p->bwidth;
1500         r = p->bytermin + y*p->bwidth;
1501         end = p->bytey0e + y*p->bwidth;
1502
1503         w = buf;
1504         dx = p->dx;
1505         nb = img->depth/8;
1506
1507         convgrey = p->convgrey; /* convert rgb to grey */
1508         isgrey = img->flags&Fgrey;
1509         alphaonly = p->alphaonly;
1510         copyalpha = (img->flags&Falpha) != 0;
1511
1512         /* if we can, avoid processing everything */
1513         if(!(img->flags&Frepl) && !convgrey && (img->flags&Fbytes)){
1514                 memset(&b, 0, sizeof b);
1515                 if(p->needbuf){
1516                         memmove(buf, r, dx*nb);
1517                         r = buf;
1518                 }
1519                 b.rgba = (ulong*)r;
1520                 if(copyalpha)
1521                         b.alpha = r+img->shift[CAlpha]/8;
1522                 else
1523                         b.alpha = &ones;
1524                 if(isgrey){
1525                         b.grey = r+img->shift[CGrey]/8;
1526                         b.red = b.grn = b.blu = b.grey;
1527                 }else{
1528                         b.red = r+img->shift[CRed]/8;
1529                         b.grn = r+img->shift[CGreen]/8;
1530                         b.blu = r+img->shift[CBlue]/8;
1531                 }
1532                 b.delta = nb;
1533                 return b;
1534         }
1535
1536         rrepl = replbit[img->nbits[CRed]];
1537         grepl = replbit[img->nbits[CGreen]];
1538         brepl = replbit[img->nbits[CBlue]];
1539         arepl = replbit[img->nbits[CAlpha]];
1540         krepl = replbit[img->nbits[CGrey]];
1541
1542         for(i=0; i<dx; i++){
1543                 u = r[0] | (r[1]<<8) | (r[2]<<16) | (r[3]<<24);
1544                 if(copyalpha) {
1545                         *w++ = arepl[(u>>img->shift[CAlpha]) & img->mask[CAlpha]];
1546                 }
1547
1548                 if(isgrey)
1549                         *w++ = krepl[(u >> img->shift[CGrey]) & img->mask[CGrey]];
1550                 else if(!alphaonly){
1551                         ured = rrepl[(u >> img->shift[CRed]) & img->mask[CRed]];
1552                         ugrn = grepl[(u >> img->shift[CGreen]) & img->mask[CGreen]];
1553                         ublu = brepl[(u >> img->shift[CBlue]) & img->mask[CBlue]];
1554                         if(convgrey){
1555                                 *w++ = RGB2K(ured, ugrn, ublu);
1556                         }else{
1557                                 w[0] = ublu;
1558                                 w[1] = ugrn;
1559                                 w[2] = ured;
1560                                 w += 3;
1561                         }
1562                 }
1563                 r += nb;
1564                 if(r == end)
1565                         r = begin;
1566         }
1567
1568         b.alpha = copyalpha ? buf : &ones;
1569         b.rgba = (ulong*)buf;
1570         if(alphaonly){
1571                 b.red = b.grn = b.blu = b.grey = nil;
1572                 if(!copyalpha)
1573                         b.rgba = nil;
1574                 b.delta = 1;
1575         }else if(isgrey || convgrey){
1576                 b.grey = buf+copyalpha;
1577                 b.red = b.grn = b.blu = buf+copyalpha;
1578                 b.delta = copyalpha+1;
1579         }else{
1580                 b.blu = buf+copyalpha;
1581                 b.grn = buf+copyalpha+1;
1582                 b.grey = nil;
1583                 b.red = buf+copyalpha+2;
1584                 b.delta = copyalpha+3;
1585         }
1586         return b;
1587 }
1588
1589 static void
1590 writebyte(Param *p, uchar *w, Buffer src)
1591 {
1592         Memimage *img;
1593         int i, isalpha, isgrey, nb, delta, dx, adelta;
1594         uchar *red, *grn, *blu, *grey, *alpha;
1595         ulong u, mask;
1596
1597         img = p->img;
1598
1599         red = src.red;
1600         grn = src.grn;
1601         blu = src.blu;
1602         alpha = src.alpha;
1603         delta = src.delta;
1604         grey = src.grey;
1605         dx = p->dx;
1606
1607         nb = img->depth/8;
1608
1609         isalpha = img->flags&Falpha;
1610         isgrey = img->flags&Fgrey;
1611         adelta = src.delta;
1612
1613         if(isalpha && alpha == &ones)
1614                 adelta = 0;
1615
1616         if((img->flags&Fbytes) != 0){
1617                 int ogry, ored, ogrn, oblu, oalp;
1618
1619                 ogry = img->shift[CGrey]/8;
1620                 ored = img->shift[CRed]/8;
1621                 ogrn = img->shift[CGreen]/8;
1622                 oblu = img->shift[CBlue]/8;
1623                 oalp = img->shift[CAlpha]/8;
1624
1625                 for(i=0; i<dx; i++){
1626                         if(isgrey){
1627                                 w[ogry] = *grey;
1628                                 grey += delta;
1629                         } else {
1630                                 w[ored] = *red;
1631                                 w[ogrn] = *grn;
1632                                 w[oblu] = *blu;
1633                                 red += delta;
1634                                 grn += delta;
1635                                 blu += delta;
1636                         }
1637                         if(isalpha){
1638                                 w[oalp] = *alpha;
1639                                 alpha += adelta;
1640                         }
1641                         w += nb;
1642                 }
1643                 return;
1644         }
1645
1646         mask = (nb==4) ? 0 : ~((1<<img->depth)-1);
1647         for(i=0; i<dx; i++){
1648                 u = w[0] | (w[1]<<8) | (w[2]<<16) | (w[3]<<24);
1649                 u &= mask;
1650                 if(isgrey){
1651                         u |= ((*grey >> (8-img->nbits[CGrey])) & img->mask[CGrey]) << img->shift[CGrey];
1652                         grey += delta;
1653                 }else{
1654                         u |= ((*red >> (8-img->nbits[CRed])) & img->mask[CRed]) << img->shift[CRed];
1655                         u |= ((*grn >> (8-img->nbits[CGreen])) & img->mask[CGreen]) << img->shift[CGreen];
1656                         u |= ((*blu >> (8-img->nbits[CBlue])) & img->mask[CBlue]) << img->shift[CBlue];
1657                         red += delta;
1658                         grn += delta;
1659                         blu += delta;
1660                 }
1661
1662                 if(isalpha){
1663                         u |= ((*alpha >> (8-img->nbits[CAlpha])) & img->mask[CAlpha]) << img->shift[CAlpha];
1664                         alpha += adelta;
1665                 }
1666
1667                 w[0] = u;
1668                 w[1] = u>>8;
1669                 w[2] = u>>16;
1670                 w[3] = u>>24;
1671                 w += nb;
1672         }
1673 }
1674
1675 static Readfn*
1676 readfn(Memimage *img)
1677 {
1678         if(img->depth < 8)
1679                 return readnbit;
1680         if(img->nbits[CMap] == 8)
1681                 return readcmap;
1682         return readbyte;
1683 }
1684
1685 static Readfn*
1686 readalphafn(Memimage *m)
1687 {
1688         USED(m);
1689         return readbyte;
1690 }
1691
1692 static Writefn*
1693 writefn(Memimage *img)
1694 {
1695         if(img->depth < 8)
1696                 return writenbit;
1697         if(img->nbits[CMap] == 8)
1698                 return writecmap;
1699         return writebyte;
1700 }
1701
1702 static void
1703 nullwrite(Param *p, uchar *s, Buffer b)
1704 {
1705         USED(p);
1706         USED(s);
1707         USED(b);
1708 }
1709
1710 static Buffer
1711 readptr(Param *p, uchar *s, int y)
1712 {
1713         Buffer b;
1714         uchar *q;
1715
1716         USED(s);
1717         q = p->bytermin + y*p->bwidth;
1718         b.red = q;      /* ptr to data */
1719         b.grn = b.blu = b.grey = nil;
1720         b.alpha = &ones;
1721         b.rgba = (ulong*)q;
1722         b.delta = p->img->depth/8;
1723         return b;
1724 }
1725
1726 static Buffer
1727 boolmemmove(Buffer bdst, Buffer bsrc, Buffer b1, int dx, int i, int o)
1728 {
1729         USED(i);
1730         USED(o);
1731         USED(b1);
1732         USED(bsrc);
1733         memmove(bdst.red, bsrc.red, dx*bdst.delta);
1734         return bdst;
1735 }
1736
1737 static Buffer
1738 boolcopy8(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1739 {
1740         uchar *m, *r, *w, *ew;
1741
1742         USED(i);
1743         USED(o);
1744         m = bmask.grey;
1745         w = bdst.red;
1746         r = bsrc.red;
1747         ew = w+dx;
1748         for(; w < ew; w++,r++)
1749                 if(*m++)
1750                         *w = *r;
1751         return bdst;    /* not used */
1752 }
1753
1754 static Buffer
1755 boolcopy16(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1756 {
1757         uchar *m;
1758         ushort *r, *w, *ew;
1759
1760         USED(i);
1761         USED(o);
1762         m = bmask.grey;
1763         w = (ushort*)bdst.red;
1764         r = (ushort*)bsrc.red;
1765         ew = w+dx;
1766         for(; w < ew; w++,r++)
1767                 if(*m++)
1768                         *w = *r;
1769         return bdst;    /* not used */
1770 }
1771
1772 static Buffer
1773 boolcopy24(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1774 {
1775         uchar *m;
1776         uchar *r, *w, *ew;
1777
1778         USED(i);
1779         USED(o);
1780         m = bmask.grey;
1781         w = bdst.red;
1782         r = bsrc.red;
1783         ew = w+dx*3;
1784         while(w < ew){
1785                 if(*m++){
1786                         *w++ = *r++;
1787                         *w++ = *r++;
1788                         *w++ = *r++;
1789                 }else{
1790                         w += 3;
1791                         r += 3;
1792                 }
1793         }
1794         return bdst;    /* not used */
1795 }
1796
1797 static Buffer
1798 boolcopy32(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1799 {
1800         uchar *m;
1801         ulong *r, *w, *ew;
1802
1803         USED(i);
1804         USED(o);
1805         m = bmask.grey;
1806         w = (ulong*)bdst.red;
1807         r = (ulong*)bsrc.red;
1808         ew = w+dx;
1809         for(; w < ew; w++,r++)
1810                 if(*m++)
1811                         *w = *r;
1812         return bdst;    /* not used */
1813 }
1814
1815 static Buffer
1816 genconv(Param *p, uchar *buf, int y)
1817 {
1818         Buffer b;
1819         int nb;
1820         uchar *r, *w, *ew;
1821
1822         /* read from source into RGB format in convbuf */
1823         b = p->convreadcall(p, p->convbuf, y);
1824
1825         /* write RGB format into dst format in buf */
1826         p->convwritecall(p->convdpar, buf, b);
1827
1828         if(p->convdx){
1829                 nb = p->convdpar->img->depth/8;
1830                 r = buf;
1831                 w = buf+nb*p->dx;
1832                 ew = buf+nb*p->convdx;
1833                 while(w<ew)
1834                         *w++ = *r++;
1835         }
1836
1837         b.red = buf;
1838         b.blu = b.grn = b.grey = nil;
1839         b.alpha = &ones;
1840         b.rgba = (ulong*)buf;
1841         b.delta = 0;
1842         
1843         return b;
1844 }
1845
1846 static Readfn*
1847 convfn(Memimage *dst, Param *dpar, Memimage *src, Param *spar, int *ndrawbuf)
1848 {
1849         if(dst->chan == src->chan && !(src->flags&Frepl))
1850                 return readptr;
1851
1852         if(dst->chan==CMAP8 && (src->chan==GREY1||src->chan==GREY2||src->chan==GREY4)){
1853                 /* cheat because we know the replicated value is exactly the color map entry. */
1854                 return readnbit;
1855         }
1856
1857         spar->convreadcall = readfn(src);
1858         spar->convwritecall = writefn(dst);
1859         spar->convdpar = dpar;
1860
1861         /* allocate a conversion buffer */
1862         spar->convbufoff = *ndrawbuf;
1863         *ndrawbuf += spar->dx*4;
1864
1865         if(spar->dx > Dx(spar->img->r)){
1866                 spar->convdx = spar->dx;
1867                 spar->dx = Dx(spar->img->r);
1868         }
1869
1870         return genconv;
1871 }
1872
1873 static ulong
1874 pixelbits(Memimage *i, Point pt)
1875 {
1876         uchar *p;
1877         ulong val;
1878         int off, bpp, npack;
1879
1880         val = 0;
1881         p = byteaddr(i, pt);
1882         switch(bpp=i->depth){
1883         case 1:
1884         case 2:
1885         case 4:
1886                 npack = 8/bpp;
1887                 off = pt.x%npack;
1888                 val = p[0] >> bpp*(npack-1-off);
1889                 val &= (1<<bpp)-1;
1890                 break;
1891         case 8:
1892                 val = p[0];
1893                 break;
1894         case 16:
1895                 val = p[0]|(p[1]<<8);
1896                 break;
1897         case 24:
1898                 val = p[0]|(p[1]<<8)|(p[2]<<16);
1899                 break;
1900         case 32:
1901                 val = p[0]|(p[1]<<8)|(p[2]<<16)|(p[3]<<24);
1902                 break;
1903         }
1904         while(bpp<32){
1905                 val |= val<<bpp;
1906                 bpp *= 2;
1907         }
1908         return val;
1909 }
1910
1911 static Calcfn*
1912 boolcopyfn(Memimage *img, Memimage *mask)
1913 {
1914         if(mask->flags&Frepl && Dx(mask->r)==1 && Dy(mask->r)==1 && pixelbits(mask, mask->r.min)==~0)
1915                 return boolmemmove;
1916
1917         switch(img->depth){
1918         case 8:
1919                 return boolcopy8;
1920         case 16:
1921                 return boolcopy16;
1922         case 24:
1923                 return boolcopy24;
1924         case 32:
1925                 return boolcopy32;
1926         default:
1927                 assert(0 /* boolcopyfn */);
1928         }
1929         return nil;
1930 }
1931
1932 /*
1933  * Optimized draw for filling and scrolling; uses memset and memmove.
1934  */
1935 static void
1936 memsets(void *vp, ushort val, int n)
1937 {
1938         ushort *p, *ep;
1939         uchar b[2];
1940
1941         /* make little endian */
1942         b[0] = val;
1943         b[1] = val>>8;
1944         val = *(ushort*)b;
1945
1946         p = vp;
1947         ep = p+n;
1948         while(p<ep)
1949                 *p++ = val;
1950 }
1951
1952 static void
1953 memsetl(void *vp, ulong val, int n)
1954 {
1955         ulong *p, *ep;
1956         uchar b[4];
1957
1958         /* make little endian */
1959         b[0] = val;
1960         b[1] = val>>8;
1961         b[2] = val>>16;
1962         b[3] = val>>24;
1963         val = *(ulong*)b;
1964
1965         p = vp;
1966         ep = p+n;
1967         while(p<ep)
1968                 *p++ = val;
1969 }
1970
1971 static void
1972 memset24(void *vp, ulong val, int n)
1973 {
1974         uchar *p, *ep;
1975         uchar a,b,c;
1976
1977         a = val;
1978         b = val>>8;
1979         c = val>>16;
1980
1981         p = vp;
1982         ep = p+3*n;
1983         while(p<ep){
1984                 p[0] = a;
1985                 p[1] = b;
1986                 p[2] = c;
1987                 p += 3;
1988         }
1989 }
1990
1991 static ulong
1992 imgtorgba(Memimage *img, ulong val)
1993 {
1994         uchar r, g, b, a;
1995         int nb, ov, v;
1996         ulong chan;
1997         uchar *p;
1998
1999         a = 0xFF;
2000         r = g = b = 0xAA;       /* garbage */
2001         for(chan=img->chan; chan; chan>>=8){
2002                 nb = NBITS(chan);
2003                 ov = v = val&((1<<nb)-1);
2004                 val >>= nb;
2005
2006                 while(nb < 8){
2007                         v |= v<<nb;
2008                         nb *= 2;
2009                 }
2010                 v >>= (nb-8);
2011
2012                 switch(TYPE(chan)){
2013                 case CRed:
2014                         r = v;
2015                         break;
2016                 case CGreen:
2017                         g = v;
2018                         break;
2019                 case CBlue:
2020                         b = v;
2021                         break;
2022                 case CAlpha:
2023                         a = v;
2024                         break;
2025                 case CGrey:
2026                         r = g = b = v;
2027                         break;
2028                 case CMap:
2029                         p = img->cmap->cmap2rgb+3*ov;
2030                         r = p[0];
2031                         g = p[1];
2032                         b = p[2];
2033                         break;
2034                 }
2035         }
2036         return (r<<24)|(g<<16)|(b<<8)|a;        
2037 }
2038
2039 static ulong
2040 rgbatoimg(Memimage *img, ulong rgba)
2041 {
2042         ulong chan;
2043         int d, nb;
2044         ulong v;
2045         uchar *p, r, g, b, a, m;
2046
2047         v = 0;
2048         r = rgba>>24;
2049         g = rgba>>16;
2050         b = rgba>>8;
2051         a = rgba;
2052         d = 0;
2053         for(chan=img->chan; chan; chan>>=8){
2054                 nb = NBITS(chan);
2055                 switch(TYPE(chan)){
2056                 case CRed:
2057                         v |= (r>>(8-nb))<<d;
2058                         break;
2059                 case CGreen:
2060                         v |= (g>>(8-nb))<<d;
2061                         break;
2062                 case CBlue:
2063                         v |= (b>>(8-nb))<<d;
2064                         break;
2065                 case CAlpha:
2066                         v |= (a>>(8-nb))<<d;
2067                         break;
2068                 case CMap:
2069                         p = img->cmap->rgb2cmap;
2070                         m = p[(r>>4)*256+(g>>4)*16+(b>>4)];
2071                         v |= (m>>(8-nb))<<d;
2072                         break;
2073                 case CGrey:
2074                         m = RGB2K(r,g,b);
2075                         v |= (m>>(8-nb))<<d;
2076                         break;
2077                 }
2078                 d += nb;
2079         }
2080         return v;
2081 }
2082
/*
 * Try three special-case fast paths for the draw described by par.
 * Returns 1 if one of them performed the whole operation, 0 if none applied.
 *  - solid fill: par->state has Simplesrc, Simplemask and Fullmask, the
 *    low byte of par->srgba is 0xFF, and op is S or SoverD;
 *  - row copy: par->state has Simplemask and Fullmask but not Replsrc,
 *    source depth >= 8, identical channel formats, source lacks Falpha,
 *    and op is S or SoverD;
 *  - bitwise merge: GREY1 source, mask and destination whose starting x
 *    coordinates agree modulo 8, with none of Simplemask, Simplesrc,
 *    Replmask, Replsrc set.
 */
2083 static int
2084 memoptdraw(Memdrawparam *par)
2085 {
2086         int m, y, dy, dx, op;
2087         ulong v;
2088         Memimage *src;
2089         Memimage *dst;
2090
2091         dx = Dx(par->r);
2092         dy = Dy(par->r);
2093         src = par->src;
2094         dst = par->dst;
2095         op = par->op;
2096
2097         /*
2098          * If we have an opaque mask and source is one opaque pixel we can convert to the
2099          * destination format and just replicate with memset.
2100          */
2101         m = Simplesrc|Simplemask|Fullmask;
2102         if((par->state&m)==m && (par->srgba&0xFF) == 0xFF && (op ==S || op == SoverD)){
2103                 int d, dwid, ppb, np, nb;
2104                 uchar *dp, lm, rm;
2105
                     /* dwid: byte distance from one destination row to the next */
2106                 dwid = dst->width*sizeof(ulong);
2107                 dp = byteaddr(dst, par->r.min);
2108                 v = par->sdval;
2109                 switch(dst->depth){
2110                 case 1:
2111                 case 2:
2112                 case 4:
                             /* repeat the pixel value until it fills a byte */
2113                         for(d=dst->depth; d<8; d*=2)
2114                                 v |= (v<<d);
2115                         ppb = 8/dst->depth;     /* pixels per byte */
2116                         m = ppb-1;
2117                         /* left edge */
2118                         np = par->r.min.x&m;            /* no. pixels unused on left side of word */
2119                         dx -= (ppb-np);
2120                         nb = 8 - np * dst->depth;               /* no. bits used on right side of word */
2121                         lm = (1<<nb)-1;
2122
2123                         /* right edge */
2124                         np = par->r.max.x&m;    /* no. pixels used on left side of word */
2125                         dx -= np;
2126                         nb = 8 - np * dst->depth;               /* no. bits unused on right side of word */
2127                         rm = ~((1<<nb)-1);
2128
                             /*
                              * dx now counts the pixels lying in whole bytes strictly
                              * between the left-edge byte and the right-edge byte.
                              * `*dp ^= (v ^ *dp) & mask' replaces only the bits of
                              * *dp selected by mask with the same bits of v.
                              */
2129                         /* lm, rm are masks that are 1 where we should touch the bits */
2130                         if(dx < 0){     /* just one byte */
2131                                 lm &= rm;
2132                                 for(y=0; y<dy; y++, dp+=dwid)
2133                                         *dp ^= (v ^ *dp) & lm;
2134                         }else if(dx == 0){      /* no full bytes */
                                     /* the loop body steps dp over the left byte itself, so take that step out of the stride */
2135                                 if(lm)
2136                                         dwid--;
2137
2138                                 for(y=0; y<dy; y++, dp+=dwid){
2139                                         if(lm){
2140                                                 *dp ^= (v ^ *dp) & lm;
2141                                                 dp++;
2142                                         }
2143                                         *dp ^= (v ^ *dp) & rm;
2144                                 }
2145                         }else{          /* full bytes in middle */
                                     /* dx becomes a byte count; the stride is reduced by every byte the loop body steps over */
2146                                 dx /= ppb;
2147                                 if(lm)
2148                                         dwid--;
2149                                 dwid -= dx;
2150
2151                                 for(y=0; y<dy; y++, dp+=dwid){
2152                                         if(lm){
2153                                                 *dp ^= (v ^ *dp) & lm;
2154                                                 dp++;
2155                                         }
2156                                         memset(dp, v, dx);
2157                                         dp += dx;
2158                                         *dp ^= (v ^ *dp) & rm;
2159                                 }
2160                         }
2161                         return 1;
2162                 case 8:
2163                         for(y=0; y<dy; y++, dp+=dwid)
2164                                 memset(dp, v, dx);
2165                         return 1;
2166                 case 16:
2167                         for(y=0; y<dy; y++, dp+=dwid)
2168                                 memsets(dp, v, dx);
2169                         return 1;
2170                 case 24:
2171                         for(y=0; y<dy; y++, dp+=dwid)
2172                                 memset24(dp, v, dx);
2173                         return 1;
2174                 case 32:
2175                         for(y=0; y<dy; y++, dp+=dwid)
2176                                 memsetl(dp, v, dx);
2177                         return 1;
2178                 default:
                             /* the only case that does not return: if assert comes back, control continues with the tests below */
2179                         assert(0 /* bad dest depth in memoptdraw */);
2180                 }
2181         }
2182
2183         /*
2184          * If no source alpha, an opaque mask, we can just copy the
2185          * source onto the destination.  If the channels are the same and
2186          * the source is not replicated, memmove suffices.
2187          */
2188         m = Simplemask|Fullmask;
2189         if((par->state&(m|Replsrc))==m && src->depth >= 8 
2190         && src->chan == dst->chan && !(src->flags&Falpha) && (op == S || op == SoverD)){
2191                 uchar *sp, *dp;
2192                 long swid, dwid, nb;
2193                 int dir;
2194
                     /*
                      * Source and destination share storage and the destination
                      * starts at a higher address: copy rows last to first
                      * (presumably so rows still to be read are not overwritten).
                      */
2195                 if(src->data == dst->data && byteaddr(dst, par->r.min) > byteaddr(src, par->sr.min))
2196                         dir = -1;
2197                 else
2198                         dir = 1;
2199
2200                 swid = src->width*sizeof(ulong);
2201                 dwid = dst->width*sizeof(ulong);
2202                 sp = byteaddr(src, par->sr.min);
2203                 dp = byteaddr(dst, par->r.min);
2204                 if(dir == -1){
                             /* start at the last row and walk upward with negated strides */
2205                         sp += (dy-1)*swid;
2206                         dp += (dy-1)*dwid;
2207                         swid = -swid;
2208                         dwid = -dwid;
2209                 }
                     /* nb: bytes per row of the rectangle */
2210                 nb = (dx*src->depth)/8;
2211                 for(y=0; y<dy; y++, sp+=swid, dp+=dwid)
2212                         memmove(dp, sp, nb);
2213                 return 1;
2214         }
2215
2216         /*
2217          * If we have a 1-bit mask, 1-bit source, and 1-bit destination, and
2218          * they're all bit aligned, we can just use bit operators.  This happens
2219          * when we're manipulating boolean masks, e.g. in the arc code.
2220          */
2221         if((par->state&(Simplemask|Simplesrc|Replmask|Replsrc))==0 
2222         && dst->chan==GREY1 && src->chan==GREY1 && par->mask->chan==GREY1 
2223         && (par->r.min.x&7)==(par->sr.min.x&7) && (par->r.min.x&7)==(par->mr.min.x&7)){
2224                 uchar *sp, *dp, *mp;
2225                 uchar lm, rm;
2226                 long swid, dwid, mwid;
2227                 int i, x, dir;
2228
2229                 sp = byteaddr(src, par->sr.min);
2230                 dp = byteaddr(dst, par->r.min);
2231                 mp = byteaddr(par->mask, par->mr.min);
2232                 swid = src->width*sizeof(ulong);
2233                 dwid = dst->width*sizeof(ulong);
2234                 mwid = par->mask->width*sizeof(ulong);
2235
2236                 if(src->data == dst->data && byteaddr(dst, par->r.min) > byteaddr(src, par->sr.min)){
2237                         dir = -1;
2238                 }else
2239                         dir = 1;
2240
                     /*
                      * lm selects the bits of the first byte at and to the right of
                      * r.min.x; rm selects the bits of the last byte to the left of
                      * r.max.x (rm is 0 when r.max.x is a multiple of 8).
                      * dx is reduced to the bit count of the whole bytes in between.
                      */
2241                 lm = 0xFF>>(par->r.min.x&7);
2242                 rm = 0xFF<<(8-(par->r.max.x&7));
2243                 dx -= (8-(par->r.min.x&7)) + (par->r.max.x&7);
2244
2245                 if(dx < 0){     /* one byte wide */
2246                         lm &= rm;
2247                         if(dir == -1){
2248                                 dp += dwid*(dy-1);
2249                                 sp += swid*(dy-1);
2250                                 mp += mwid*(dy-1);
2251                                 dwid = -dwid;
2252                                 swid = -swid;
2253                                 mwid = -mwid;
2254                         }
2255                         for(y=0; y<dy; y++){
                                     /* take source bits where both the mask byte and lm have a 1; keep destination bits elsewhere */
2256                                 *dp ^= (*dp ^ *sp) & *mp & lm;
2257                                 dp += dwid;
2258                                 sp += swid;
2259                                 mp += mwid;
2260                         }
2261                         return 1;
2262                 }
2263
2264                 dx /= 8;
2265                 if(dir == 1){
                             /* i: bytes handled per row; removed from each stride because the loop body advances the pointers by i */
2266                         i = (lm!=0)+dx+(rm!=0);
2267                         mwid -= i;
2268                         swid -= i;
2269                         dwid -= i;
2270                         for(y=0; y<dy; y++, dp+=dwid, sp+=swid, mp+=mwid){
2271                                 if(lm){
2272                                         *dp ^= (*dp ^ *sp++) & *mp++ & lm;
2273                                         dp++;
2274                                 }
2275                                 for(x=0; x<dx; x++){
2276                                         *dp ^= (*dp ^ *sp++) & *mp++;
2277                                         dp++;
2278                                 }
2279                                 if(rm){
2280                                         *dp ^= (*dp ^ *sp++) & *mp++ & rm;
2281                                         dp++;
2282                                 }
2283                         }
2284                         return 1;
2285                 }else{
2286                 /* dir == -1 */
                             /*
                              * Start at the last byte of the last row and walk each row
                              * right to left; the row loop walks back i bytes, so the
                              * negated stride has i added back.
                              */
2287                         i = (lm!=0)+dx+(rm!=0);
2288                         dp += dwid*(dy-1)+i-1;
2289                         sp += swid*(dy-1)+i-1;
2290                         mp += mwid*(dy-1)+i-1;
2291                         dwid = -dwid+i;
2292                         swid = -swid+i;
2293                         mwid = -mwid+i;
2294                         for(y=0; y<dy; y++, dp+=dwid, sp+=swid, mp+=mwid){
2295                                 if(rm){
2296                                         *dp ^= (*dp ^ *sp--) & *mp-- & rm;
2297                                         dp--;
2298                                 }
2299                                 for(x=0; x<dx; x++){
2300                                         *dp ^= (*dp ^ *sp--) & *mp--;
2301                                         dp--;
2302                                 }
2303                                 if(lm){
2304                                         *dp ^= (*dp ^ *sp--) & *mp-- & lm;
2305                                         dp--;
2306                                 }
2307                         }
2308                 }
2309                 return 1;
2310         }
2311         return 0;       
2312 }
2313
2314 /*
2315  * Boolean character drawing.
2316  * Solid opaque color through a 1-bit greyscale mask.
2317  */
/*
 * Returns 0 without drawing unless all of these hold: par->state has
 * Replsrc and Simplesrc but not Replmask, the mask depth is 1, the source
 * has no Falpha flag, the destination depth is at least 8, destination and
 * source do not share data, and op is SoverD.
 * Otherwise stores par->sdval into each destination pixel of par->r whose
 * mask bit (starting at par->mr.min) is 1, and returns 1.
 */
2318 static int
2319 chardraw(Memdrawparam *par)
2320 {
2321         int i, ddepth, dy, dx, x, bx, ex, y, npack, bsh, depth, op;
2322         ulong bits, v, maskwid, dstwid;
2323         uchar *wp, *rp, *q, *wc;
2324         ushort *ws;
2325         ulong *wl;
2326         uchar sp[4];
2327         Rectangle r, mr;
2328         Memimage *mask, *src, *dst;
2329
2330         mask = par->mask;
2331         src = par->src;
2332         dst = par->dst;
2333         r = par->r;
2334         mr = par->mr;
2335         op = par->op;
2336
2337         if((par->state&(Replsrc|Simplesrc|Replmask)) != (Replsrc|Simplesrc)
2338         || mask->depth != 1 || src->flags&Falpha || dst->depth<8 || dst->data==src->data
2339         || op != SoverD)
2340                 return 0;
2341
             /* depth is 1 here (checked above), so npack is 8 and bsh is the bit offset of mr.min.x within its mask byte */
2342         depth = mask->depth;
2343         maskwid = mask->width*sizeof(ulong);
2344         rp = byteaddr(mask, mr.min);
2345         npack = 8/depth;
2346         bsh = (mr.min.x % npack) * depth;
2347
2348         wp = byteaddr(dst, r.min);
2349         dstwid = dst->width*sizeof(ulong);
2350         dy = Dy(r);
2351         dx = Dx(r);
2352
2353         ddepth = dst->depth;
2354
2355         /*
2356          * for loop counts from bsh to bsh+dx
2357          *
2358          * we want the bottom bits to be the amount
2359          * to shift the pixels down, so for n≡0 (mod 8) we want 
2360          * bottom bits 7.  for n≡1, 6, etc.
2361          * the bits come from -n-1.
2362          */
2363
2364         bx = -bsh-1;
2365         ex = -bsh-1-dx;
             /* NOTE(review): SET presumably only tells the compiler bits is considered initialized -- confirm */
2366         SET(bits);
2367         v = par->sdval;
2368
2369         /* make little endian */
2370         sp[0] = v;
2371         sp[1] = v>>8;
2372         sp[2] = v>>16;
2373         sp[3] = v>>24;
2374
2375         for(y=0; y<dy; y++, rp+=maskwid, wp+=dstwid){
2376                 q = rp;
                     /*
                      * The inner loops fetch a new mask byte only when x&7 is 7.
                      * When the row starts mid-byte (bsh != 0) the first x&7 is
                      * not 7, so the first mask byte is loaded here instead.
                      */
2377                 if(bsh)
2378                         bits = *q++;
2379                 switch(ddepth){
2380                 case 8:
2381                         wc = wp;
2382                         for(x=bx; x>ex; x--, wc++){
2383                                 i = x&7;
2384                                 if(i == 8-1)
2385                                         bits = *q++;
2386                                 if((bits>>i)&1)
2387                                         *wc = v;
2388                         }
2389                         break;
2390                 case 16:
2391                         ws = (ushort*)wp;
                             /* reload v from the byte-ordered copy; the same value is recomputed on every row */
2392                         v = *(ushort*)sp;
2393                         for(x=bx; x>ex; x--, ws++){
2394                                 i = x&7;
2395                                 if(i == 8-1)
2396                                         bits = *q++;
2397                                 if((bits>>i)&1)
2398                                         *ws = v;
2399                         }
2400                         break;
2401                 case 24:
2402                         wc = wp;
2403                         for(x=bx; x>ex; x--, wc+=3){
2404                                 i = x&7;
2405                                 if(i == 8-1)
2406                                         bits = *q++;
2407                                 if((bits>>i)&1){
2408                                         wc[0] = sp[0];
2409                                         wc[1] = sp[1];
2410                                         wc[2] = sp[2];
2411                                 }
2412                         }
2413                         break;
2414                 case 32:
2415                         wl = (ulong*)wp;
2416                         v = *(ulong*)sp;
2417                         for(x=bx; x>ex; x--, wl++){
2418                                 i = x&7;
2419                                 if(i == 8-1)
2420                                         bits = *q++;
2421                                 if((bits>>i)&1)
2422                                         *wl = v;
2423                         }
2424                         break;
                     /* other destination depths draw nothing, yet the function still returns 1 */
2425                 }
2426         }
2427         return 1;       
2428 }
2429
2430
2431 void
2432 memfillcolor(Memimage *i, ulong val)
2433 {
2434         ulong bits;
2435         int d, y;
2436
2437         if(val == DNofill)
2438                 return;
2439
2440         bits = rgbatoimg(i, val);
2441         switch(i->depth){
2442         case 24:        /* 24-bit images suck */
2443                 for(y=i->r.min.y; y<i->r.max.y; y++)
2444                         memset24(byteaddr(i, Pt(i->r.min.x, y)), bits, Dx(i->r));
2445                 break;
2446         default:        /* 1, 2, 4, 8, 16, 32 */
2447                 for(d=i->depth; d<32; d*=2)
2448                         bits = (bits << d) | bits;
2449                 memsetl(wordaddr(i, i->r.min), bits, i->width*Dy(i->r));
2450                 break;
2451         }
2452 }
2453