sys/src/libmemdraw/draw.c

   1 #include <u.h>
   2 #include <libc.h>
   3 #include <draw.h>
   4 #include <memdraw.h>
   5 #include <pool.h>
   6
   7 extern Pool* imagmem;
   8 int drawdebug;
   9
  10 /* perfect approximation to NTSC = .299r+.587g+.114b when 0 ≤ r,g,b < 256 */
     /*
      * The three weights are the NTSC coefficients in fixed point, scaled
      * by 2^19 (156763/2^19 ≈ .299, 307758/2^19 ≈ .587, 59769/2^19 ≈ .114);
      * the final >>19 removes the scale.  Each argument is expanded once.
      */
  11 #define RGB2K(r,g,b)    ((156763*(r)+307758*(g)+59769*(b))>>19)
  12
  13 /*
  14  * For 16-bit values, x / 255 == (t = x+1, (t+(t>>8)) >> 8).
  15  * We add another 127 to round to the nearest value rather
  16  * than truncate.
  17  *
  18  * CALCxy does x bytewise calculations on y input images (x=1,4; y=1,2).
  19  * CALC2x does two parallel 16-bit calculations on y input images (y=1,2).
      *
      * The +1 and the +127 described above are folded into the single
      * +128 (or +0x00800080 in the two-lane forms) that each macro adds.
      * Every macro assigns to its tmp argument(s), so callers must pass
      * scratch lvalues; the result is the value of the whole expression.
  20  */
     /* one byte: a*v/255, rounded */
  21 #define CALC11(a, v, tmp) \
  22         (tmp=(a)*(v)+128, (tmp+(tmp>>8))>>8)
  23
     /* one byte from two inputs: (a1*v1 + a2*v2)/255, rounded */
  24 #define CALC12(a1, v1, a2, v2, tmp) \
  25         (tmp=(a1)*(v1)+(a2)*(v2)+128, (tmp+(tmp>>8))>>8)
  26
     /*
      * MASK keeps bits 0-7 and 16-23 of a word: the two 8-bit lanes that
      * the CALC2x macros operate on, each with 8 spare bits above it.
      */
  27 #define MASK 0xFF00FF
  28
     /* both lanes of vvuu scaled by a; the result is again confined to MASK */
  29 #define CALC21(a, vvuu, tmp) \
  30         (tmp=(a)*(vvuu)+0x00800080, ((tmp+((tmp>>8)&MASK))>>8)&MASK)
  31
     /*
      * All four bytes of rgba scaled by a: the even bytes (rgba & MASK) and
      * the odd bytes ((rgba>>8) & MASK) each go through CALC21 and are then
      * recombined.
      * NOTE(review): rgba is pasted without parentheses in "rgba & MASK",
      * so pass a simple operand (a variable or *ptr), not a compound
      * expression.
      */
  32 #define CALC41(a, rgba, tmp1, tmp2) \
  33         (CALC21(a, rgba & MASK, tmp1) | \
  34          (CALC21(a, (rgba>>8)&MASK, tmp2)<<8))
  35
     /* both lanes of a1*vvuu1 + a2*vvuu2, divided by 255 with rounding */
  36 #define CALC22(a1, vvuu1, a2, vvuu2, tmp) \
  37         (tmp=(a1)*(vvuu1)+(a2)*(vvuu2)+0x00800080, ((tmp+((tmp>>8)&MASK))>>8)&MASK)
  38
     /*
      * Four-byte form of CALC22, split into even and odd bytes like CALC41.
      * NOTE(review): rgba1 and rgba2 are likewise pasted without
      * parentheses.
      */
  39 #define CALC42(a1, rgba1, a2, rgba2, tmp1, tmp2) \
  40         (CALC22(a1, rgba1 & MASK, a2, rgba2 & MASK, tmp1) | \
  41          (CALC22(a1, (rgba1>>8) & MASK, a2, (rgba2>>8) & MASK, tmp2)<<8))
  42
  43 static void mktables(void);
     /*
      * A Subdraw tries one drawing strategy for a prepared Memdrawparam.
      * memimagedraw returns as soon as memoptdraw or chardraw reports
      * nonzero; alphadraw is the last resort and its result is not checked.
      */
  44 typedef int Subdraw(Memdrawparam*);
  45 static Subdraw chardraw, alphadraw, memoptdraw;
  46
     /*
      * memimageinit allocates memones and memzeros as 1x1 replicated GREY1
      * images (first byte ~0 and 0 respectively) and then points memwhite
      * and memopaque at memones, and memblack and memtransparent at memzeros.
      * All six are nil until memimageinit has succeeded.
      */
  47 static Memimage*        memones;
  48 static Memimage*        memzeros;
  49 Memimage *memwhite;
  50 Memimage *memblack;
  51 Memimage *memtransparent;
  52 Memimage *memopaque;
  53
  54 int     _ifmt(Fmt*);
  55
     /*
      * One-time initialisation of the memdraw library.
      *
      * Hooks memimagemove into the image pool (only when imagmem exists and
      * is named "Image" or "image"), builds the conversion tables and colour
      * map, installs the 'R', 'P' and 'b' print formats, and creates the 1x1
      * replicated images behind memwhite, memblack, memopaque and
      * memtransparent.
      *
      * Returns 0 on success, or immediately if a previous call succeeded.
      * Returns -1 if either constant image cannot be allocated; in that case
      * nothing is left allocated and the call may be retried.
      */
  56 int
  57 memimageinit(void)
  58 {
  59         static int didinit = 0;
  60
  61         if(didinit)
  62                 return 0;
  63
  64         if(imagmem != nil)
  65         if(strcmp(imagmem->name, "Image") == 0 || strcmp(imagmem->name, "image") == 0)
  66                 imagmem->move = memimagemove;
  67
  68         mktables();
  69         _memmkcmap();
  70
  71         fmtinstall('R', Rfmt);
  72         fmtinstall('P', Pfmt);
  73         fmtinstall('b', _ifmt);
  74
  75         memones = allocmemimage(Rect(0,0,1,1), GREY1);
  76         memzeros = allocmemimage(Rect(0,0,1,1), GREY1);
  77         if(memones == nil || memzeros == nil){
                     /*
                      * didinit stays 0, so a later call will allocate both
                      * images again: release whichever one we did get now
                      * instead of leaking it.
                      */
                     if(memones != nil)
                             freememimage(memones);
                     if(memzeros != nil)
                             freememimage(memzeros);
                     memones = nil;
                     memzeros = nil;
  78                 return -1;
             }
  79
             /* replicate the single pixel over an effectively unbounded clip rectangle */
  80         memones->flags |= Frepl;
  81         memones->clipr = Rect(-0x3FFFFFF, -0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF);
  82         *byteaddr(memones, ZP) = ~0;
  83
  84         memzeros->flags |= Frepl;
  85         memzeros->clipr = Rect(-0x3FFFFFF, -0x3FFFFFF, 0x3FFFFFF, 0x3FFFFFF);
  86         *byteaddr(memzeros, ZP) = 0;
  87
  88         memwhite = memones;
  89         memblack = memzeros;
  90         memopaque = memones;
  91         memtransparent = memzeros;
  92
  93         didinit = 1;
  94         return 0;
  95 }
  96
  97 static ulong imgtorgba(Memimage*, ulong);
  98 static ulong rgbatoimg(Memimage*, ulong);
  99 static ulong pixelbits(Memimage*, Point);
 100
 101 #define DBG if(0)
     /*
      * Draw src through mask onto rectangle r of dst with compositing
      * operator op.  p0 is the point in src and p1 the point in mask that
      * correspond to r.min.  A nil mask means memopaque.
      *
      * The request is first clipped by drawclip; an empty result, or an op
      * outside Clear..SoverD, makes the call a silent no-op.  A 1x1
      * replicated source or mask is reduced to a single pixel value up
      * front, which also lets fully transparent fills return early.  The
      * prepared Memdrawparam is then offered to hwdraw, memoptdraw and
      * chardraw in turn; alphadraw handles whatever none of them accepted.
      */
 102 void
 103 memimagedraw(Memimage *dst, Rectangle r, Memimage *src, Point p0, Memimage *mask, Point p1, int op)
 104 {
 106         Memdrawparam par;
 107
 108         if(mask == nil)
 109                 mask = memopaque;
 110
 111 DBG     print("memimagedraw %p/%luX %R @ %p %p/%luX %P %p/%luX %P... ", dst, dst->chan, r, dst->data->bdata, src, src->chan, p0, mask, mask->chan, p1);
 112
             /* r, p0 and p1 are local copies; drawclip adjusts them and fills par.sr, par.mr */
 113         if(drawclip(dst, &r, src, &p0, mask, &p1, &par.sr, &par.mr) == 0){
 114 //              if(drawdebug)
 115 //                      iprint("empty clipped rectangle\n");
 116                 return;
 117         }
 118
 119         if(op < Clear || op > SoverD){
 120 //              if(drawdebug)
 121 //                      iprint("op out of range: %d\n", op);
 122                 return;
 123         }
 124
 125         par.op = op;
 126         par.dst = dst;
 127         par.r = r;
 128         par.src = src;
 129         /* par.sr set by drawclip */
 130         par.mask = mask;
 131         /* par.mr set by drawclip */
 132
 133         par.state = 0;
 134         if(src->flags&Frepl){
 135                 par.state |= Replsrc;
 136                 if(Dx(src->r)==1 && Dy(src->r)==1){
                             /* single-pixel source: precompute its value in src, rgba and dst formats */
 137                         par.sval = pixelbits(src, src->r.min);
 138                         par.state |= Simplesrc;
 139                         par.srgba = imgtorgba(src, par.sval);
 140                         par.sdval = rgbatoimg(dst, par.srgba);
                             /* low byte of srgba is tested as the source alpha */
 141                         if((par.srgba&0xFF) == 0 && (op&DoutS)){
 142 //                              if (drawdebug) iprint("fill with transparent source\n");
 143                                 return; /* no-op successfully handled */
 144                         }
 145                 }
 146         }
 147
 148         if(mask->flags & Frepl){
 149                 par.state |= Replmask;
 150                 if(Dx(mask->r)==1 && Dy(mask->r)==1){
 151                         par.mval = pixelbits(mask, mask->r.min);
 152                         if(par.mval == 0 && (op&DoutS)){
 153 //                              if(drawdebug) iprint("fill with zero mask\n");
 154                                 return; /* no-op successfully handled */
 155                         }
 156                         par.state |= Simplemask;
                             /* NOTE(review): ~0 is an int; this relies on mval being all ones after conversion */
 157                         if(par.mval == ~0)
 158                                 par.state |= Fullmask;
 159                         par.mrgba = imgtorgba(mask, par.mval);
 160                 }
 161         }
 162
 163 //      if(drawdebug)
 164 //              iprint("dr %R sr %R mr %R...", r, par.sr, par.mr);
 165 DBG print("draw dr %R sr %R mr %R %lux\n", r, par.sr, par.mr, par.state);
 166
 167         /*
 168          * Now that we've clipped the parameters down to be consistent, we
 169          * simply try sub-drawing routines in order until we find one that was able
 170          * to handle us.  If the sub-drawing routine returns zero, it means it was
 171          * unable to satisfy the request, so we do not return.
 172          */
 173
 174         /*
 175          * Hardware support.  Each video driver provides this function,
 176          * which checks to see if there is anything it can help with.
 177          * There could be an if around this checking to see if dst is in video memory.
 178          */
 179 DBG print("test hwdraw\n");
 180         if(hwdraw(&par)){
 181 //if(drawdebug) iprint("hw handled\n");
 182 DBG print("hwdraw handled\n");
 183                 return;
 184         }
 185         /*
 186          * Optimizations using memmove and memset.
 187          */
 188 DBG print("test memoptdraw\n");
 189         if(memoptdraw(&par)){
 190 //if(drawdebug) iprint("memopt handled\n");
 191 DBG print("memopt handled\n");
 192                 return;
 193         }
 194
 195         /*
 196          * Character drawing.
 197          * Solid source color being painted through a boolean mask onto a high res image.
 198          */
 199 DBG print("test chardraw\n");
 200         if(chardraw(&par)){
 201 //if(drawdebug) iprint("chardraw handled\n");
 202 DBG print("chardraw handled\n");
 203                 return;
 204         }
 205
 206         /*
 207          * General calculation-laden case that does alpha for each pixel.
 208          */
 209 DBG print("do alphadraw\n");
 210         alphadraw(&par);
 211 //if(drawdebug) iprint("alphadraw handled\n");
 212 DBG print("alphadraw handled\n");
 213 }
 214 #undef DBG
 215
 216
 217 /*
 218  * Clip the destination rectangle further based on the properties of the
 219  * source and mask rectangles.  Once the destination rectangle is properly
 220  * clipped, adjust the source and mask rectangles to be the same size.
 221  *
 222  * Return zero if the final rectangle is null.
      *
      * r, p0 and p1 are updated in place: on success *r is the clipped
      * destination rectangle and *p0, *p1 are the source and mask points
      * that correspond to r->min.  *sr and *mr are pure outputs: the
      * matching rectangles in source and mask coordinates, with
      * sr->min == *p0 and mr->min == *p1.  All three rectangles end up
      * with the same width and height (asserted at the end).
      *
      * An image with Frepl set is clipped only against its clipr, not
      * against its r.
 223  */
 224 int
 225 drawclipnorepl(Memimage *dst, Rectangle *r, Memimage *src, Point *p0, Memimage *mask, Point *p1, Rectangle *sr, Rectangle *mr)
 226 {
 227         Point rmin, delta;
 228         int splitcoords;
 229         Rectangle omr;
 230
 231         if(badrect(*r))
 232                 return 0;
             /* when source and mask share one origin a single rectangle serves both */
 233         splitcoords = (p0->x!=p1->x) || (p0->y!=p1->y);
 234         /* clip to destination */
 235         rmin = r->min;
 236         if(!rectclip(r, dst->r) || !rectclip(r, dst->clipr))
 237                 return 0;
 238         /* move mask point */
 239         p1->x += r->min.x-rmin.x;
 240         p1->y += r->min.y-rmin.y;
 241         /* move source point */
 242         p0->x += r->min.x-rmin.x;
 243         p0->y += r->min.y-rmin.y;
 244         /* map destination rectangle into source */
 245         sr->min = *p0;
 246         sr->max.x = p0->x+Dx(*r);
 247         sr->max.y = p0->y+Dy(*r);
 248         /* sr is r in source coordinates; clip to source */
 249         if(!(src->flags&Frepl) && !rectclip(sr, src->r))
 250                 return 0;
 251         if(!rectclip(sr, src->clipr))
 252                 return 0;
 253         /* compute and clip rectangle in mask */
 254         if(splitcoords){
 255                 /* move mask point with source */
 256                 p1->x += sr->min.x-p0->x;
 257                 p1->y += sr->min.y-p0->y;
 258                 mr->min = *p1;
 259                 mr->max.x = p1->x+Dx(*sr);
 260                 mr->max.y = p1->y+Dy(*sr);
                     /* remember the unclipped mask rectangle to measure what the clip removes */
 261                 omr = *mr;
 262                 /* mr is now rectangle in mask; clip it */
 263                 if(!(mask->flags&Frepl) && !rectclip(mr, mask->r))
 264                         return 0;
 265                 if(!rectclip(mr, mask->clipr))
 266                         return 0;
 267                 /* reflect any clips back to source */
 268                 sr->min.x += mr->min.x-omr.min.x;
 269                 sr->min.y += mr->min.y-omr.min.y;
 270                 sr->max.x += mr->max.x-omr.max.x;
 271                 sr->max.y += mr->max.y-omr.max.y;
 272         }else{
                     /* same coordinates: clip sr against the mask directly and copy it */
 273                 if(!(mask->flags&Frepl) && !rectclip(sr, mask->r))
 274                         return 0;
 275                 if(!rectclip(sr, mask->clipr))
 276                         return 0;
 277                 *mr = *sr;
 278         }
 279         /* move source clipping back to destination */
             /* delta must be taken before *p0 is overwritten with sr->min below */
 280         delta.x = r->min.x - p0->x;
 281         delta.y = r->min.y - p0->y;
 282         r->min.x = sr->min.x + delta.x;
 283         r->min.y = sr->min.y + delta.y;
 284         r->max.x = sr->max.x + delta.x;
 285         r->max.y = sr->max.y + delta.y;
 286         *p0 = sr->min;
 287         *p1 = mr->min;
 288
 289         assert(Dx(*sr) == Dx(*mr) && Dx(*mr) == Dx(*r));
 290         assert(Dy(*sr) == Dy(*mr) && Dy(*mr) == Dy(*r));
 291         assert(ptinrect(r->min, dst->r));
 292
 293         return 1;
 294 }
 295
 296 /*
 297  * like drawclipnorepl() above, but if source or mask is replicated,
 298  * move its clipped rectangle so that its minimum point falls within
 299  * the repl rectangle.
 300  *
 301  * Return zero if the final rectangle is null.
      *
      * Arguments and outputs are those of drawclipnorepl().  On a nonzero
      * return *p0 lies inside src->r and *p1 inside mask->r (both are
      * asserted).  drawreplxy() and drawrepl() are defined elsewhere;
      * presumably they wrap a coordinate or point into the given range.
 302  */
 303 int
 304 drawclip(Memimage *dst, Rectangle *r, Memimage *src, Point *p0, Memimage *mask, Point *p1, Rectangle *sr, Rectangle *mr)
 305 {
 306         Point delta;
 307
 308         if(!drawclipnorepl(dst, r, src, p0, mask, p1, sr, mr))
 309                 return 0;
 310
 311         /* move source rectangle so sr->min is in src->r */
 312         if(src->flags&Frepl) {
                     /* translate the whole rectangle by the same amount so its size is unchanged */
 313                 delta.x = drawreplxy(src->r.min.x, src->r.max.x, sr->min.x) - sr->min.x;
 314                 delta.y = drawreplxy(src->r.min.y, src->r.max.y, sr->min.y) - sr->min.y;
 315                 sr->min.x += delta.x;
 316                 sr->min.y += delta.y;
 317                 sr->max.x += delta.x;
 318                 sr->max.y += delta.y;
 319                 *p0 = sr->min;
 320         }
 321
 322         /* move mask point so it is in mask->r */
             /* done whether or not the mask has Frepl set; mr is rebuilt at the size of sr */
 323         *p1 = drawrepl(mask->r, *p1);
 324         mr->min = *p1;
 325         mr->max.x = p1->x+Dx(*sr);
 326         mr->max.y = p1->y+Dy(*sr);
 327
 328         assert(ptinrect(*p0, src->r));
 329         assert(ptinrect(*p1, mask->r));
 330
 331         return 1;
 332 }
 333
 334 /*
 335  * Conversion tables.
      * All four are filled in by mktables().
 336  */
 337 static uchar replbit[1+8][256];         /* replbit[x][y] is the replication of the x-bit quantity y to 8-bit depth */
 338 static uchar conv18[256][8];            /* conv18[x][y] is the yth pixel in the depth-1 pixel x */
 339 static uchar conv28[256][4];            /* ... */
 340 static uchar conv48[256][2];
 341
 342 /*
 343  * bitmap of how to replicate n bits to fill 8, for 1 ≤ n ≤ 8.
 344  * the X's are where to put the bottom (ones) bit of the n-bit pattern.
 345  * only the top 8 bits of the result are actually used.
 346  * (the lower 8 bits are needed to get bits in the right place
 347  * when n is not a divisor of 8.)
 348  *
 349  * Should check to see if it's easier to just refer to replmul than
 350  * use the precomputed values in replbit.  On PCs it may well
 351  * be; on machines with slow multiply instructions it probably isn't.
      *
      * How the macros work: `a' opens sixteen parentheses around a 0, and
      * each X (`*2+1)') or _ (`*2)') closes one of them while shifting in
      * a 1 or a 0 bit.  A row is therefore a 16-bit constant written in
      * binary, most significant bit first.  replmul[0] is an unused 0.
 352  */
 353 #define a ((((((((((((((((0
 354 #define X *2+1)
 355 #define _ *2)
 356 static int replmul[1+8] = {
 357         0,
 358         a X X X X X X X X X X X X X X X X,
 359         a _ X _ X _ X _ X _ X _ X _ X _ X,
 360         a _ _ X _ _ X _ _ X _ _ X _ _ X _,
 361         a _ _ _ X _ _ _ X _ _ _ X _ _ _ X,
 362         a _ _ _ _ X _ _ _ _ X _ _ _ _ X _,
 363         a _ _ _ _ _ X _ _ _ _ _ X _ _ _ _,
 364         a _ _ _ _ _ _ X _ _ _ _ _ _ X _ _,
 365         a _ _ _ _ _ _ _ X _ _ _ _ _ _ _ X,
 366 };
 367 #undef a
 368 #undef X
 369 #undef _
 370
     /*
      * Fill in replbit, conv18, conv28 and conv48.  Called from
      * memimageinit().
      *
      * replbit[j][i] takes the low j bits of i, multiplies them by
      * replmul[j] and keeps bits 8-15 of the product (truncated to a
      * uchar).  The convN8 tables then unpack each possible byte of N-bit
      * pixels, leftmost (most significant) pixel first, into 8-bit values
      * via replbit[N].
      */
 371 static void
 372 mktables(void)
 373 {
 374         int i, j, mask, sh, small;
 375
 376         /* bit replication up to 8 bits */
 377         for(i=0; i<256; i++){
                     /* j == 0 is harmless: the mask and replmul[0] are both 0 */
 378                 for(j=0; j<=8; j++){    /* j <= 8 [sic] */
 379                         small = i & ((1<<j)-1);
 380                         replbit[j][i] = (small*replmul[j])>>8;
 381                 }
 382         }
 383
 384         /* bit unpacking up to 8 bits, only powers of 2 */
 385         for(i=0; i<256; i++){
                     /* sh starts at the leftmost pixel and steps right by one pixel width */
 386                 for(j=0, sh=7, mask=1; j<8; j++, sh--)
 387                         conv18[i][j] = replbit[1][(i>>sh)&mask];
 388
 389                 for(j=0, sh=6, mask=3; j<4; j++, sh-=2)
 390                         conv28[i][j] = replbit[2][(i>>sh)&mask];
 391
 392                 for(j=0, sh=4, mask=15; j<2; j++, sh-=4)
 393                         conv48[i][j] = replbit[4][(i>>sh)&mask];
 394         }
 395 }
 396
     /*
      * Shared constant 0xFF byte.  Buffer.alpha points here when there is
      * no alpha data; code in this file compares alpha against &ones to
      * detect that case.
      */
 397 static uchar ones = 0xff;
 398
 399 /*
 400  * General alpha drawing case.  Can handle anything.
 401  */
     /*
      * A Buffer describes one scan line as a set of per-channel byte
      * pointers.  Each pointer addresses the first pixel; adding delta
      * moves it to the next pixel.
      */
 402 typedef struct  Buffer  Buffer;
 403 struct Buffer {
 404         /* used by most routines */
 405         uchar   *red;
 406         uchar   *grn;
 407         uchar   *blu;
 408         uchar   *alpha; /* is &ones when unused, never nil */
 409         uchar   *grey;
 410         ulong   *rgba;
 411         int     delta;  /* number of bytes to add to pointer to get next pixel to the right */
 412
 413         /* used by boolcalc* for mask data */
 414         uchar   *m;             /* ptr to mask data r.min byte; like p->bytermin */
 415         int             mskip;  /* no. of left bits to skip in *m */
 416         uchar   *bm;            /* ptr to mask data img->r.min byte; like p->bytey0s */
 417         int             bmskip; /* no. of left bits to skip in *bm */
 418         uchar   *em;            /* ptr to mask data img->r.max.x byte; like p->bytey0e */
 419         int             emskip; /* no. of right bits to skip in *em */
 420 };
 421
 422 typedef struct  Param   Param;
     /*
      * The three stages alphadraw runs for every scan line:
      *   Readfn(par, buf, y)   produce line y of an image as a Buffer;
      *   Calcfn(dst, src, mask, dx, grey, op)   combine dx pixels, return the result;
      *   Writefn(par, addr, buf)   store a Buffer at addr in the destination.
      */
 423 typedef Buffer  Readfn(Param*, uchar*, int);
 424 typedef void    Writefn(Param*, uchar*, Buffer);
 425 typedef Buffer  Calcfn(Buffer, Buffer, Buffer, int, int, int);
 426
 427 enum {
             /* tallest replicated image whose lines getparam will cache */
 428         MAXBCACHE = 16
 429 };
 430
 431 /* giant rathole to customize functions with */
     /* one Param per image (source, mask, destination) taking part in an alphadraw */
 432 struct Param {
 433         Readfn  *replcall;      /* real reader, saved when replread is substituted */
 434         Readfn  *greymaskcall;  /* mask reader used behind greymaskread */
 435         Readfn  *convreadcall;
 436         Writefn *convwritecall;
 437
 438         Memimage *img;
 439         Rectangle       r;      /* rectangle of img being read or written */
 440         int     dx;     /* of r */
 441         int     needbuf;
 442         int     convgrey;
 443         int     alphaonly;      /* alphadraw sets this for a mask that has an alpha channel */
 444
 445         uchar   *bytey0s;               /* byteaddr(Pt(img->r.min.x, img->r.min.y)) */
 446         uchar   *bytermin;      /* byteaddr(Pt(r.min.x, img->r.min.y)) */
 447         uchar   *bytey0e;               /* byteaddr(Pt(img->r.max.x, img->r.min.y)) */
 448         int             bwidth; /* sizeof(ulong)*img->width; alphadraw uses it as the byte step between lines */
 449
 450         int     replcache;      /* if set, cache buffers */
 451         Buffer  bcache[MAXBCACHE];
 452         ulong   bfilled;
 453         uchar   *bufbase;       /* scratch buffer + bufoff, set once the buffer exists */
 454         int     bufoff; /* this Param's offset within the shared scratch buffer */
 455         int     bufdelta;       /* bytes reserved per buffered line (4*dx) */
 456
 457         int     dir;    /* +1 or -1: direction alphadraw steps through the lines */
 458
 459         int     convbufoff;
 460         uchar   *convbuf;
 461         Param   *convdpar;
 462         int     convdx;
 463 };
 464
     /*
      * NOTE(review): alphadraw declares locals named drawbuf and ndrawbuf
      * that hide these file-scope variables; within the part of the file
      * visible here the statics themselves are not referenced.
      */
 465 static uchar *drawbuf;
 466 static int      ndrawbuf;
 467 static int      mdrawbuf;
 468 static Readfn   greymaskread, replread, readptr;
 469 static Writefn  nullwrite;
 470 static Calcfn   alphacalc0, alphacalc14, alphacalc2810, alphacalc3679, alphacalc5, alphacalc11, alphacalcS;
 471 static Calcfn   boolcalc14, boolcalc236789, boolcalc1011;
 472
 473 static Readfn*  readfn(Memimage*);
 474 static Readfn*  readalphafn(Memimage*);
 475 static Writefn* writefn(Memimage*);
 476
 477 static Calcfn*  boolcopyfn(Memimage*, Memimage*);
 478 static Readfn*  convfn(Memimage*, Param*, Memimage*, Param*, int*);
 479 static Readfn*  ptrfn(Memimage*);
 480
     /*
      * Calculator used by alphadraw for each compositing operator, indexed
      * by op.  The digits in a function's name are the table positions
      * (0 = Clear ... 11 = SoverD) that it serves.
      */
 481 static Calcfn *alphacalc[Ncomp] =
 482 {
 483         alphacalc0,             /* Clear */
 484         alphacalc14,            /* DoutS */
 485         alphacalc2810,          /* SoutD */
 486         alphacalc3679,          /* DxorS */
 487         alphacalc14,            /* DinS */
 488         alphacalc5,             /* D */
 489         alphacalc3679,          /* DatopS */
 490         alphacalc3679,          /* DoverS */
 491         alphacalc2810,          /* SinD */
 492         alphacalc3679,          /* SatopD */
 493         alphacalc2810,          /* S */
 494         alphacalc11,            /* SoverD */
 495 };
 496
     /*
      * Same indexing as alphacalc.  alphadraw switches to this table when
      * the mask is GREY1 and the source has no alpha channel.
      */
 497 static Calcfn *boolcalc[Ncomp] =
 498 {
 499         alphacalc0,             /* Clear */
 500         boolcalc14,             /* DoutS */
 501         boolcalc236789,         /* SoutD */
 502         boolcalc236789,         /* DxorS */
 503         boolcalc14,             /* DinS */
 504         alphacalc5,             /* D */
 505         boolcalc236789,         /* DatopS */
 506         boolcalc236789,         /* DoverS */
 507         boolcalc236789,         /* SinD */
 508         boolcalc236789,         /* SatopD */
 509         boolcalc1011,           /* S */
 510         boolcalc1011,           /* SoverD */
 511 };
 512
 513 /*
 514  * Avoid standard Lock, QLock so that can be used in kernel.
      *
      * A Dbuf is the per-call working state of alphadraw: a scratch
      * buffer p of n bytes that is kept and reused between calls, plus the
      * Params for source, mask and destination.  inuse marks a slot as
      * claimed; alphadraw clears it when it is done.
 515  */
 516 typedef struct Dbuf Dbuf;
 517 struct Dbuf
 518 {
 519         uchar *p;
 520         int n;
 521         Param spar, mpar, dpar;
 522         int inuse;
 523 };
 524 static Dbuf dbuf[10];
 525
     /*
      * Claim a free slot of dbuf[].  Returns the slot, or nil when all of
      * them are in use.  The plain test of inuse is only a quick filter;
      * the slot is actually claimed through _tas(), which is defined
      * elsewhere (presumably an atomic test-and-set that returns the
      * previous value).
      */
 526 static Dbuf*
 527 allocdbuf(void)
 528 {
 529         int i;
 530
 531         for(i=0; i<nelem(dbuf); i++){
 532                 if(dbuf[i].inuse)
 533                         continue;
 534                 if(!_tas(&dbuf[i].inuse))
 535                         return &dbuf[i];
 536         }
 537         return nil;
 538 }
 539
     /*
      * Initialise *p for accessing rectangle r of img.
      *
      * convgrey and needbuf are stored as given.  *ndrawbuf is the running
      * size of the scratch buffer shared by all Params of one draw: its
      * current value becomes this Param's offset (p->bufoff) and it is
      * then advanced by the space reserved here, 4*Dx(r) bytes per line.
      * One line is reserved, or Dy(img->r) lines when img is replicated,
      * no taller than MAXBCACHE, and shorter than r, in which case
      * p->replcache is also set.
      *
      * r.min.x must lie inside img->r horizontally (asserted).
      */
 540 static void
 541 getparam(Param *p, Memimage *img, Rectangle r, int convgrey, int needbuf, int *ndrawbuf)
 542 {
 543         int nbuf;
 544
             /* every field not assigned below starts out zero */
 545         memset(p, 0, sizeof *p);
 546
 547         p->img = img;
 548         p->r = r;
 549         p->dx = Dx(r);
 550         p->needbuf = needbuf;
 551         p->convgrey = convgrey;
 552
 553         assert(img->r.min.x <= r.min.x && r.min.x < img->r.max.x);
 554
             /* all three addresses are on the image's first line (y = img->r.min.y) */
 555         p->bytey0s = byteaddr(img, Pt(img->r.min.x, img->r.min.y));
 556         p->bytermin = byteaddr(img, Pt(r.min.x, img->r.min.y));
 557         p->bytey0e = byteaddr(img, Pt(img->r.max.x, img->r.min.y));
 558         p->bwidth = sizeof(ulong)*img->width;
 559
 560         assert(p->bytey0s <= p->bytermin && p->bytermin <= p->bytey0e);
 561
 562         if(p->r.min.x == p->img->r.min.x)
 563                 assert(p->bytermin == p->bytey0s);
 564
 565         nbuf = 1;
 566         if((img->flags&Frepl) && Dy(img->r) <= MAXBCACHE && Dy(img->r) < Dy(r)){
 567                 p->replcache = 1;
 568                 nbuf = Dy(img->r);
 569         }
 570         p->bufdelta = 4*p->dx;
 571         p->bufoff = *ndrawbuf;
 572         *ndrawbuf += p->bufdelta*nbuf;
 573 }
 574
     /*
      * Wrap a line offset that has stepped one past either end of img
      * back into [0, Dy(img->r)): dy becomes 0 and -1 becomes dy-1.
      * Any other out-of-range value trips the assert.  alphadraw calls
      * this after each single-line step instead of taking a modulus.
      */
 575 static void
 576 clipy(Memimage *img, int *y)
 577 {
 578         int dy;
 579
 580         dy = Dy(img->r);
 581         if(*y == dy)
 582                 *y = 0;
 583         else if(*y == -1)
 584                 *y = dy-1;
 585         assert(0 <= *y && *y < dy);
 586 }
 587
     /*
      * Debugging aid: print the label s followed by the first n pixels of
      * b on one line.  A pixel is shown as its grey byte if b.grey is
      * set, otherwise as whichever of red, green and blue are set; the
      * alpha byte is added unless b.alpha is the shared &ones.  b is
      * passed by value, so advancing its pointers does not affect the
      * caller's Buffer.
      */
 588 static void
 589 dumpbuf(char *s, Buffer b, int n)
 590 {
 591         int i;
 592         uchar *p;
 593
 594         print("%s", s);
 595         for(i=0; i<n; i++){
 596                 print(" ");
                     /* each test below assigns the channel pointer to p, then checks it for nil */
 597                 if(p=b.grey){
 598                         print(" k%.2uX", *p);
 599                         b.grey += b.delta;
 600                 }else{
 601                         if(p=b.red){
 602                                 print(" r%.2uX", *p);
 603                                 b.red += b.delta;
 604                         }
 605                         if(p=b.grn){
 606                                 print(" g%.2uX", *p);
 607                                 b.grn += b.delta;
 608                         }
 609                         if(p=b.blu){
 610                                 print(" b%.2uX", *p);
 611                                 b.blu += b.delta;
 612                         }
 613                 }
 614                 if((p=b.alpha) != &ones){
 615                         print(" α%.2uX", *p);
 616                         b.alpha += b.delta;
 617                 }
 618         }
 619         print("\n");
 620 }
 621
 622 /*
 623  * For each scan line, we expand the pixels from source, mask, and destination
 624  * into byte-aligned red, green, blue, alpha, and grey channels.  If buffering is not
 625  * needed and the channels were already byte-aligned (grey8, rgb24, rgba32, rgb32),
 626  * the readers need not copy the data: they can simply return pointers to the data.
 627  * If the destination image is grey and the source is not, it is converted using the NTSC
 628  * formula.
 629  *
 630  * Once we have all the channels, we call either rgbcalc or greycalc, depending on
 631  * whether the destination image is color.  This is allowed to overwrite the dst buffer (perhaps
 632  * the actual data, perhaps a copy) with its result.  It should only overwrite the dst buffer
 633  * with the same format (i.e. red bytes with red bytes, etc.)  A new buffer is returned from
 634  * the calculator, and that buffer is passed to a function to write it to the destination.
 635  * If the buffer is already pointing at the destination, the writing function is a no-op.
      *
      * par must already have been clipped and filled in by memimagedraw().
      * Returns 1 when the rectangle has been drawn.  Returns 0, having
      * drawn nothing, when no Dbuf slot is free or the scratch buffer
      * cannot be allocated.
 636  */
 637 #define DBG if(0)
 638 static int
 639 alphadraw(Memdrawparam *par)
 640 {
 641         int isgrey, starty, endy, op;
 642         int needbuf, dsty, srcy, masky;
 643         int y, dir, dx, dy, ndrawbuf;
 644         uchar *drawbuf;
 645         Buffer bsrc, bdst, bmask;
 646         Readfn *rdsrc, *rdmask, *rddst;
 647         Calcfn *calc;
 648         Writefn *wrdst;
 649         Memimage *src, *mask, *dst;
 650         Rectangle r, sr, mr;
 651         Dbuf *z;
 652
 653         r = par->r;
 654         dx = Dx(r);
 655         dy = Dy(r);
 656
 657         z = allocdbuf();
 658         if(z == nil)
 659                 return 0;
 660
 661         src = par->src;
 662         mask = par->mask;
 663         dst = par->dst;
 664         sr = par->sr;
 665         mr = par->mr;
 666         op = par->op;
 667
 668         isgrey = dst->flags&Fgrey;
 669
 670         /*
 671          * Buffering when src and dst are the same bitmap is sufficient but not
 672          * necessary.  There are stronger conditions we could use.  We could
 673          * check to see if the rectangles intersect, and if simply moving in the
 674          * correct y direction can avoid the need to buffer.
 675          */
 676         needbuf = (src->data == dst->data);
 677
             /* each getparam adds its scratch space requirement to ndrawbuf */
 678         ndrawbuf = 0;
 679         getparam(&z->spar, src, sr, isgrey, needbuf, &ndrawbuf);
 680         getparam(&z->dpar, dst, r, isgrey, needbuf, &ndrawbuf);
 681         getparam(&z->mpar, mask, mr, 0, needbuf, &ndrawbuf);
 682
             /* shared bitmap with dst at the higher address: walk the lines bottom-up */
 683         dir = (needbuf && byteaddr(dst, r.min) > byteaddr(src, sr.min)) ? -1 : 1;
 684         z->spar.dir = z->mpar.dir = z->dpar.dir = dir;
 685
 686         /*
 687          * If the mask is purely boolean, we can convert from src to dst format
 688          * when we read src, and then just copy it to dst where the mask tells us to.
 689          * This requires a boolean (1-bit grey) mask and lack of a source alpha channel.
 690          *
 691          * The computation is accomplished by assigning the function pointers as follows:
 692          *      rdsrc - read and convert source into dst format in a buffer
 693          *      rdmask - convert mask to bytes, set pointer to it
 694          *      rddst - fill with pointer to real dst data, but do no reads
 695          *      calc - copy src onto dst when mask says to.
 696          *      wrdst - do nothing
 697          * This is slightly sleazy, since things aren't doing exactly what their names say,
 698          * but it avoids a fair amount of code duplication to make this a case here
 699          * rather than have a separate booldraw.
 700          */
 701 //if(drawdebug) iprint("flag %lud mchan %lux=?%x dd %d\n", src->flags&Falpha, mask->chan, GREY1, dst->depth);
 702         if(!(src->flags&Falpha) && mask->chan == GREY1 && dst->depth >= 8 && op == SoverD){
 703 //if(drawdebug) iprint("boolcopy...");
 704                 rdsrc = convfn(dst, &z->dpar, src, &z->spar, &ndrawbuf);
 705                 rddst = readptr;
 706                 rdmask = readfn(mask);
 707                 calc = boolcopyfn(dst, mask);
 708                 wrdst = nullwrite;
 709         }else{
 710                 /* usual alphadraw parameter fetching */
 711                 rdsrc = readfn(src);
 712                 rddst = readfn(dst);
 713                 wrdst = writefn(dst);
 714                 calc = alphacalc[op];
 715
 716                 /*
 717                  * If there is no alpha channel, we'll ask for a grey channel
 718                  * and pretend it is the alpha.
 719                  */
 720                 if(mask->flags&Falpha){
 721                         rdmask = readalphafn(mask);
 722                         z->mpar.alphaonly = 1;
 723                 }else{
 724                         z->mpar.greymaskcall = readfn(mask);
 725                         z->mpar.convgrey = 1;
 726                         rdmask = greymaskread;
 727
 728                         /*
 729                          * Should really be above, but then boolcopyfns would have
 730                          * to deal with bit alignment, and I haven't written that.
 731                          *
 732                          * This is a common case for things like ellipse drawing.
 733                          * When there's no alpha involved and the mask is boolean,
 734                          * we can avoid all the division and multiplication.
 735                          */
 736                         if(mask->chan == GREY1 && !(src->flags&Falpha))
 737                                 calc = boolcalc[op];
 738                         else if(op == SoverD && !(src->flags&Falpha))
 739                                 calc = alphacalcS;
 740                 }
 741         }
 742
 743         /*
 744          * If the image has a small enough repl rectangle,
 745          * we can just read each line once and cache them.
 746          */
 747         if(z->spar.replcache){
 748                 z->spar.replcall = rdsrc;
 749                 rdsrc = replread;
 750         }
 751         if(z->mpar.replcache){
 752                 z->mpar.replcall = rdmask;
 753                 rdmask = replread;
 754         }
 755
             /* the scratch buffer stays with the Dbuf slot and only ever grows */
 756         if(z->n < ndrawbuf){
 757                 free(z->p);
 758                 if((z->p = mallocz(ndrawbuf, 0)) == nil){
                             /*
                              * The old buffer has been freed and z->p is now nil,
                              * so the recorded size must go too.  Otherwise a later
                              * draw that fits in the old size would skip the
                              * allocation and use a nil buffer.
                              */
                             z->n = 0;
 759                         z->inuse = 0;
 760                         return 0;
 761                 }
 762                 z->n = ndrawbuf;
 763         }
 764         drawbuf = z->p;
 765
 766         /*
 767          * Before we were saving only offsets from drawbuf in the parameter
 768          * structures; now that drawbuf has been grown to accommodate us,
 769          * we can fill in the pointers.
 770          */
 771         z->spar.bufbase = drawbuf+z->spar.bufoff;
 772         z->mpar.bufbase = drawbuf+z->mpar.bufoff;
 773         z->dpar.bufbase = drawbuf+z->dpar.bufoff;
 774         z->spar.convbuf = drawbuf+z->spar.convbufoff;
 775
 776         if(dir == 1){
 777                 starty = 0;
 778                 endy = dy;
 779         }else{
 780                 starty = dy-1;
 781                 endy = -1;
 782         }
 783
 784         /*
 785          * srcy, masky, and dsty are offsets from the top of their
 786          * respective Rectangles.  they need to be contained within
 787          * the rectangles, so clipy can keep them there without division.
 788          */
 789         srcy = (starty + sr.min.y - src->r.min.y)%Dy(src->r);
 790         masky = (starty + mr.min.y - mask->r.min.y)%Dy(mask->r);
 791         dsty = starty + r.min.y - dst->r.min.y;
 792
 793         assert(0 <= srcy && srcy < Dy(src->r));
 794         assert(0 <= masky && masky < Dy(mask->r));
 795         assert(0 <= dsty && dsty < Dy(dst->r));
 796
             /* one pass per line: read the three inputs, combine, write back */
 797         for(y=starty; y!=endy; y+=dir, srcy+=dir, masky+=dir, dsty+=dir){
 798                 clipy(src, &srcy);
 799                 clipy(dst, &dsty);
 800                 clipy(mask, &masky);
 801
 802                 bsrc = rdsrc(&z->spar, z->spar.bufbase, srcy);
 803 DBG print("[");
 804                 bmask = rdmask(&z->mpar, z->mpar.bufbase, masky);
 805 DBG print("]\n");
 806                 bdst = rddst(&z->dpar, z->dpar.bufbase, dsty);
 807 DBG             dumpbuf("src", bsrc, dx);
 808 DBG             dumpbuf("mask", bmask, dx);
 809 DBG             dumpbuf("dst", bdst, dx);
 810                 bdst = calc(bdst, bsrc, bmask, dx, isgrey, op);
 811                 wrdst(&z->dpar, z->dpar.bytermin+dsty*z->dpar.bwidth, bdst);
 812         }
 813
             /* hand the Dbuf slot back for reuse */
 814         z->inuse = 0;
 815         return 1;
 816 }
 817 #undef DBG
 818
     /*
      * Calcfn for the Clear operator (entry 0 of both the alphacalc and
      * boolcalc tables): zero dx*bdst.delta bytes starting at bdst.rgba
      * and return bdst.  The source, mask, grey and op arguments are
      * ignored.
      */
 819 static Buffer
 820 alphacalc0(Buffer bdst, Buffer b1, Buffer b2, int dx, int grey, int op)
 821 {
 822         USED(grey);
 823         USED(op);
 824         USED(b1);
 825         USED(b2);
 826         memset(bdst.rgba, 0, dx*bdst.delta);
 827         return bdst;
 828 }
 829
 830 /*
 831  * Do the channels in the buffers match enough
 832  * that we can do word-at-a-time operations
 833  * on the pixels?
      *
      * Returns 1 if so, 0 if not.  Only the byte offsets of the channel
      * pointers relative to each buffer's rgba pointer are compared; the
      * pixel size is not checked here (alphacalc14, for one, tests that
      * both deltas are 4 before calling).
 834  */
 835 static int
 836 chanmatch(Buffer *bdst, Buffer *bsrc)
 837 {
 838         uchar *drgb, *srgb;
 839
 840         /*
 841          * first, r, g, b must be in the same place
 842          * in the rgba word.
 843          */
 844         drgb = (uchar*)bdst->rgba;
 845         srgb = (uchar*)bsrc->rgba;
 846         if(bdst->red - drgb != bsrc->red - srgb
 847         || bdst->blu - drgb != bsrc->blu - srgb
 848         || bdst->grn - drgb != bsrc->grn - srgb)
 849                 return 0;
 850
 851         /*
 852          * that implies alpha is in the same place,
 853          * if it is there at all (it might be == &ones).
 854          * if the destination is &ones, we can scribble
 855          * over the rgba slot just fine.
 856          */
 857         if(bdst->alpha == &ones)
 858                 return 1;
 859
 860         /*
 861          * if the destination is not ones but the src is,
 862          * then the simultaneous calculation will use
 863          * bogus bytes from the src's rgba.  no good.
 864          */
 865         if(bsrc->alpha == &ones)
 866                 return 0;
 867
 868         /*
 869          * otherwise, alphas are in the same place.
 870          */
 871         return 1;
 872 }
 873
 874 static Buffer
 875 alphacalc14(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
 876 {
 877         Buffer obdst;
 878         int fd, sadelta;
 879         int i, sa, ma, q;
 880         ulong t, t1;
 881
 882         obdst = bdst;
 883         sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
 884         q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);
 885
 886         for(i=0; i<dx; i++){
 887                 sa = *bsrc.alpha;
 888                 ma = *bmask.alpha;
 889                 fd = CALC11(sa, ma, t);
 890                 if(op == DoutS)
 891                         fd = 255-fd;
 892
 893                 if(grey){
 894                         *bdst.grey = CALC11(fd, *bdst.grey, t);
 895                         bsrc.grey += bsrc.delta;
 896                         bdst.grey += bdst.delta;
 897                 }else{
 898                         if(q){
 899                                 *bdst.rgba = CALC41(fd, *bdst.rgba, t, t1);
 900                                 bsrc.rgba++;
 901                                 bdst.rgba++;
 902                                 bsrc.alpha += sadelta;
 903                                 bmask.alpha += bmask.delta;
 904                                 continue;
 905                         }
 906                         *bdst.red = CALC11(fd, *bdst.red, t);
 907                         *bdst.grn = CALC11(fd, *bdst.grn, t);
 908                         *bdst.blu = CALC11(fd, *bdst.blu, t);
 909                         bsrc.red += bsrc.delta;
 910                         bsrc.blu += bsrc.delta;
 911                         bsrc.grn += bsrc.delta;
 912                         bdst.red += bdst.delta;
 913                         bdst.blu += bdst.delta;
 914                         bdst.grn += bdst.delta;
 915                 }
 916                 if(bdst.alpha != &ones){
 917                         *bdst.alpha = CALC11(fd, *bdst.alpha, t);
 918                         bdst.alpha += bdst.delta;
 919                 }
 920                 bmask.alpha += bmask.delta;
 921                 bsrc.alpha += sadelta;
 922         }
 923         return obdst;
 924 }
 925
 926 static Buffer
 927 alphacalc2810(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
 928 {
 929         Buffer obdst;
 930         int fs, sadelta;
 931         int i, ma, da, q;
 932         ulong t, t1;
 933
 934         obdst = bdst;
 935         sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
 936         q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);
 937
 938         for(i=0; i<dx; i++){
 939                 ma = *bmask.alpha;
 940                 da = *bdst.alpha;
 941                 if(op == SoutD)
 942                         da = 255-da;
 943                 fs = ma;
 944                 if(op != S)
 945                         fs = CALC11(fs, da, t);
 946
 947                 if(grey){
 948                         *bdst.grey = CALC11(fs, *bsrc.grey, t);
 949                         bsrc.grey += bsrc.delta;
 950                         bdst.grey += bdst.delta;
 951                 }else{
 952                         if(q){
 953                                 *bdst.rgba = CALC41(fs, *bsrc.rgba, t, t1);
 954                                 bsrc.rgba++;
 955                                 bdst.rgba++;
 956                                 bmask.alpha += bmask.delta;
 957                                 bdst.alpha += bdst.delta;
 958                                 continue;
 959                         }
 960                         *bdst.red = CALC11(fs, *bsrc.red, t);
 961                         *bdst.grn = CALC11(fs, *bsrc.grn, t);
 962                         *bdst.blu = CALC11(fs, *bsrc.blu, t);
 963                         bsrc.red += bsrc.delta;
 964                         bsrc.blu += bsrc.delta;
 965                         bsrc.grn += bsrc.delta;
 966                         bdst.red += bdst.delta;
 967                         bdst.blu += bdst.delta;
 968                         bdst.grn += bdst.delta;
 969                 }
 970                 if(bdst.alpha != &ones){
 971                         *bdst.alpha = CALC11(fs, *bsrc.alpha, t);
 972                         bdst.alpha += bdst.delta;
 973                 }
 974                 bmask.alpha += bmask.delta;
 975                 bsrc.alpha += sadelta;
 976         }
 977         return obdst;
 978 }
 979
 980 static Buffer
 981 alphacalc3679(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
 982 {
 983         Buffer obdst;
 984         int fs, fd, sadelta;
 985         int i, sa, ma, da, q;
 986         ulong t, t1;
 987
 988         obdst = bdst;
 989         sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
 990         q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);
 991
 992         for(i=0; i<dx; i++){
 993                 sa = *bsrc.alpha;
 994                 ma = *bmask.alpha;
 995                 da = *bdst.alpha;
 996                 if(op == SatopD)
 997                         fs = CALC11(ma, da, t);
 998                 else
 999                         fs = CALC11(ma, 255-da, t);
1000                 if(op == DoverS)
1001                         fd = 255;
1002                 else{
1003                         fd = CALC11(sa, ma, t);
1004                         if(op != DatopS)
1005                                 fd = 255-fd;
1006                 }
1007
1008                 if(grey){
1009                         *bdst.grey = CALC12(fs, *bsrc.grey, fd, *bdst.grey, t);
1010                         bsrc.grey += bsrc.delta;
1011                         bdst.grey += bdst.delta;
1012                 }else{
1013                         if(q){
1014                                 *bdst.rgba = CALC42(fs, *bsrc.rgba, fd, *bdst.rgba, t, t1);
1015                                 bsrc.rgba++;
1016                                 bdst.rgba++;
1017                                 bsrc.alpha += sadelta;
1018                                 bmask.alpha += bmask.delta;
1019                                 bdst.alpha += bdst.delta;
1020                                 continue;
1021                         }
1022                         *bdst.red = CALC12(fs, *bsrc.red, fd, *bdst.red, t);
1023                         *bdst.grn = CALC12(fs, *bsrc.grn, fd, *bdst.grn, t);
1024                         *bdst.blu = CALC12(fs, *bsrc.blu, fd, *bdst.blu, t);
1025                         bsrc.red += bsrc.delta;
1026                         bsrc.blu += bsrc.delta;
1027                         bsrc.grn += bsrc.delta;
1028                         bdst.red += bdst.delta;
1029                         bdst.blu += bdst.delta;
1030                         bdst.grn += bdst.delta;
1031                 }
1032                 if(bdst.alpha != &ones){
1033                         *bdst.alpha = CALC12(fs, sa, fd, da, t);
1034                         bdst.alpha += bdst.delta;
1035                 }
1036                 bmask.alpha += bmask.delta;
1037                 bsrc.alpha += sadelta;
1038         }
1039         return obdst;
1040 }
1041
1042 static Buffer
1043 alphacalc5(Buffer bdst, Buffer b1, Buffer b2, int dx, int grey, int op)
1044 {
1045         USED(dx);
1046         USED(grey);
1047         USED(op);
1048         USED(b1);
1049         USED(b2);
1050         return bdst;
1051 }
1052
1053 static Buffer
1054 alphacalc11(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1055 {
1056         Buffer obdst;
1057         int fd, sadelta;
1058         int i, sa, ma, q;
1059         ulong t, t1;
1060
1061         USED(op);
1062         obdst = bdst;
1063         sadelta = bsrc.alpha == &ones ? 0 : bsrc.delta;
1064         q = bsrc.delta == 4 && bdst.delta == 4 && chanmatch(&bdst, &bsrc);
1065
1066         for(i=0; i<dx; i++){
1067                 sa = *bsrc.alpha;
1068                 ma = *bmask.alpha;
1069                 fd = 255-CALC11(sa, ma, t);
1070
1071                 if(grey){
1072                         *bdst.grey = CALC12(ma, *bsrc.grey, fd, *bdst.grey, t);
1073                         bsrc.grey += bsrc.delta;
1074                         bdst.grey += bdst.delta;
1075                 }else{
1076                         if(q){
1077                                 *bdst.rgba = CALC42(ma, *bsrc.rgba, fd, *bdst.rgba, t, t1);
1078                                 bsrc.rgba++;
1079                                 bdst.rgba++;
1080                                 bsrc.alpha += sadelta;
1081                                 bmask.alpha += bmask.delta;
1082                                 continue;
1083                         }
1084                         *bdst.red = CALC12(ma, *bsrc.red, fd, *bdst.red, t);
1085                         *bdst.grn = CALC12(ma, *bsrc.grn, fd, *bdst.grn, t);
1086                         *bdst.blu = CALC12(ma, *bsrc.blu, fd, *bdst.blu, t);
1087                         bsrc.red += bsrc.delta;
1088                         bsrc.blu += bsrc.delta;
1089                         bsrc.grn += bsrc.delta;
1090                         bdst.red += bdst.delta;
1091                         bdst.blu += bdst.delta;
1092                         bdst.grn += bdst.delta;
1093                 }
1094                 if(bdst.alpha != &ones){
1095                         *bdst.alpha = CALC12(ma, sa, fd, *bdst.alpha, t);
1096                         bdst.alpha += bdst.delta;
1097                 }
1098                 bmask.alpha += bmask.delta;
1099                 bsrc.alpha += sadelta;
1100         }
1101         return obdst;
1102 }
1103
1104 /*
1105 not used yet
1106 source and mask alpha 1
1107 static Buffer
1108 alphacalcS0(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1109 {
1110         Buffer obdst;
1111         int i;
1112
1113         USED(op);
1114         obdst = bdst;
1115         if(bsrc.delta == bdst.delta){
1116                 memmove(bdst.rgba, bsrc.rgba, dx*bdst.delta);
1117                 return obdst;
1118         }
1119         for(i=0; i<dx; i++){
1120                 if(grey){
1121                         *bdst.grey = *bsrc.grey;
1122                         bsrc.grey += bsrc.delta;
1123                         bdst.grey += bdst.delta;
1124                 }else{
1125                         *bdst.red = *bsrc.red;
1126                         *bdst.grn = *bsrc.grn;
1127                         *bdst.blu = *bsrc.blu;
1128                         bsrc.red += bsrc.delta;
1129                         bsrc.blu += bsrc.delta;
1130                         bsrc.grn += bsrc.delta;
1131                         bdst.red += bdst.delta;
1132                         bdst.blu += bdst.delta;
1133                         bdst.grn += bdst.delta;
1134                 }
1135                 if(bdst.alpha != &ones){
1136                         *bdst.alpha = 255;
1137                         bdst.alpha += bdst.delta;
1138                 }
1139         }
1140         return obdst;
1141 }
1142 */
1143
1144 /* source alpha 1 */
1145 static Buffer
1146 alphacalcS(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1147 {
1148         Buffer obdst;
1149         int fd;
1150         int i, ma;
1151         ulong t;
1152
1153         USED(op);
1154         obdst = bdst;
1155
1156         for(i=0; i<dx; i++){
1157                 ma = *bmask.alpha;
1158                 fd = 255-ma;
1159
1160                 if(grey){
1161                         *bdst.grey = CALC12(ma, *bsrc.grey, fd, *bdst.grey, t);
1162                         bsrc.grey += bsrc.delta;
1163                         bdst.grey += bdst.delta;
1164                 }else{
1165                         *bdst.red = CALC12(ma, *bsrc.red, fd, *bdst.red, t);
1166                         *bdst.grn = CALC12(ma, *bsrc.grn, fd, *bdst.grn, t);
1167                         *bdst.blu = CALC12(ma, *bsrc.blu, fd, *bdst.blu, t);
1168                         bsrc.red += bsrc.delta;
1169                         bsrc.blu += bsrc.delta;
1170                         bsrc.grn += bsrc.delta;
1171                         bdst.red += bdst.delta;
1172                         bdst.blu += bdst.delta;
1173                         bdst.grn += bdst.delta;
1174                 }
1175                 if(bdst.alpha != &ones){
1176                         *bdst.alpha = ma+CALC11(fd, *bdst.alpha, t);
1177                         bdst.alpha += bdst.delta;
1178                 }
1179                 bmask.alpha += bmask.delta;
1180         }
1181         return obdst;
1182 }
1183
1184 static Buffer
1185 boolcalc14(Buffer bdst, Buffer b1, Buffer bmask, int dx, int grey, int op)
1186 {
1187         Buffer obdst;
1188         int i, ma, zero;
1189
1190         USED(b1);
1191
1192         obdst = bdst;
1193
1194         for(i=0; i<dx; i++){
1195                 ma = *bmask.alpha;
1196                 zero = ma ? op == DoutS : op == DinS;
1197
1198                 if(grey){
1199                         if(zero)
1200                                 *bdst.grey = 0;
1201                         bdst.grey += bdst.delta;
1202                 }else{
1203                         if(zero)
1204                                 *bdst.red = *bdst.grn = *bdst.blu = 0;
1205                         bdst.red += bdst.delta;
1206                         bdst.blu += bdst.delta;
1207                         bdst.grn += bdst.delta;
1208                 }
1209                 bmask.alpha += bmask.delta;
1210                 if(bdst.alpha != &ones){
1211                         if(zero)
1212                                 *bdst.alpha = 0;
1213                         bdst.alpha += bdst.delta;
1214                 }
1215         }
1216         return obdst;
1217 }
1218
1219 static Buffer
1220 boolcalc236789(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1221 {
1222         Buffer obdst;
1223         int fs, fd;
1224         int i, ma, da, zero;
1225         ulong t;
1226
1227         obdst = bdst;
1228         zero = !(op&1);
1229
1230         for(i=0; i<dx; i++){
1231                 ma = *bmask.alpha;
1232                 da = *bdst.alpha;
1233                 fs = da;
1234                 if(op&2)
1235                         fs = 255-da;
1236                 fd = 0;
1237                 if(op&4)
1238                         fd = 255;
1239
1240                 if(grey){
1241                         if(ma)
1242                                 *bdst.grey = CALC12(fs, *bsrc.grey, fd, *bdst.grey, t);
1243                         else if(zero)
1244                                 *bdst.grey = 0;
1245                         bsrc.grey += bsrc.delta;
1246                         bdst.grey += bdst.delta;
1247                 }else{
1248                         if(ma){
1249                                 *bdst.red = CALC12(fs, *bsrc.red, fd, *bdst.red, t);
1250                                 *bdst.grn = CALC12(fs, *bsrc.grn, fd, *bdst.grn, t);
1251                                 *bdst.blu = CALC12(fs, *bsrc.blu, fd, *bdst.blu, t);
1252                         }
1253                         else if(zero)
1254                                 *bdst.red = *bdst.grn = *bdst.blu = 0;
1255                         bsrc.red += bsrc.delta;
1256                         bsrc.blu += bsrc.delta;
1257                         bsrc.grn += bsrc.delta;
1258                         bdst.red += bdst.delta;
1259                         bdst.blu += bdst.delta;
1260                         bdst.grn += bdst.delta;
1261                 }
1262                 bmask.alpha += bmask.delta;
1263                 if(bdst.alpha != &ones){
1264                         if(ma)
1265                                 *bdst.alpha = fs+CALC11(fd, da, t);
1266                         else if(zero)
1267                                 *bdst.alpha = 0;
1268                         bdst.alpha += bdst.delta;
1269                 }
1270         }
1271         return obdst;
1272 }
1273
1274 static Buffer
1275 boolcalc1011(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int grey, int op)
1276 {
1277         Buffer obdst;
1278         int i, ma, zero;
1279
1280         obdst = bdst;
1281         zero = !(op&1);
1282
1283         for(i=0; i<dx; i++){
1284                 ma = *bmask.alpha;
1285
1286                 if(grey){
1287                         if(ma)
1288                                 *bdst.grey = *bsrc.grey;
1289                         else if(zero)
1290                                 *bdst.grey = 0;
1291                         bsrc.grey += bsrc.delta;
1292                         bdst.grey += bdst.delta;
1293                 }else{
1294                         if(ma){
1295                                 *bdst.red = *bsrc.red;
1296                                 *bdst.grn = *bsrc.grn;
1297                                 *bdst.blu = *bsrc.blu;
1298                         }
1299                         else if(zero)
1300                                 *bdst.red = *bdst.grn = *bdst.blu = 0;
1301                         bsrc.red += bsrc.delta;
1302                         bsrc.blu += bsrc.delta;
1303                         bsrc.grn += bsrc.delta;
1304                         bdst.red += bdst.delta;
1305                         bdst.blu += bdst.delta;
1306                         bdst.grn += bdst.delta;
1307                 }
1308                 bmask.alpha += bmask.delta;
1309                 if(bdst.alpha != &ones){
1310                         if(ma)
1311                                 *bdst.alpha = 255;
1312                         else if(zero)
1313                                 *bdst.alpha = 0;
1314                         bdst.alpha += bdst.delta;
1315                 }
1316         }
1317         return obdst;
1318 }
1319 /*
1320  * Replicated cached scan line read.  Call the function listed in the Param,
1321  * but cache the result so that for replicated images we only do the work once.
1322  */
1323 static Buffer
1324 replread(Param *p, uchar *s, int y)
1325 {
1326         Buffer *b;
1327
1328         USED(s);
1329         b = &p->bcache[y];
1330         if((p->bfilled & (1<<y)) == 0){
1331                 p->bfilled |= 1<<y;
1332                 *b = p->replcall(p, p->bufbase+y*p->bufdelta, y);
1333         }
1334         return *b;
1335 }
1336
1337 /*
1338  * Alpha reading function that simply relabels the grey pointer.
1339  */
1340 static Buffer
1341 greymaskread(Param *p, uchar *buf, int y)
1342 {
1343         Buffer b;
1344
1345         b = p->greymaskcall(p, buf, y);
1346         b.alpha = b.grey;
1347         return b;
1348 }
1349
1350 #define DBG if(0)
1351 static Buffer
1352 readnbit(Param *p, uchar *buf, int y)
1353 {
1354         Buffer b;
1355         Memimage *img;
1356         uchar *repl, *r, *w, *ow, bits;
1357         int i, n, sh, depth, x, dx, npack, nbits;
1358
1359         b.rgba = (ulong*)buf;
1360         b.grey = w = buf;
1361         b.red = b.blu = b.grn = w;
1362         b.alpha = &ones;
1363         b.delta = 1;
1364
1365         dx = p->dx;
1366         img = p->img;
1367         depth = img->depth;
1368         repl = &replbit[depth][0];
1369         npack = 8/depth;
1370         sh = 8-depth;
1371
1372         /* copy from p->r.min.x until end of repl rectangle */
1373         x = p->r.min.x;
1374         n = dx;
1375         if(n > p->img->r.max.x - x)
1376                 n = p->img->r.max.x - x;
1377
1378         r = p->bytermin + y*p->bwidth;
1379 DBG print("readnbit dx %d %p=%p+%d*%d, *r=%d fetch %d ", dx, r, p->bytermin, y, p->bwidth, *r, n);
1380         bits = *r++;
1381         nbits = 8;
1382         if(i=x&(npack-1)){
1383 DBG print("throwaway %d...", i);
1384                 bits <<= depth*i;
1385                 nbits -= depth*i;
1386         }
1387         for(i=0; i<n; i++){
1388                 if(nbits == 0){
1389 DBG print("(%.2ux)...", *r);
1390                         bits = *r++;
1391                         nbits = 8;
1392                 }
1393                 *w++ = repl[bits>>sh];
1394 DBG print("bit %x...", repl[bits>>sh]);
1395                 bits <<= depth;
1396                 nbits -= depth;
1397         }
1398         dx -= n;
1399         if(dx == 0)
1400                 return b;
1401
1402         assert(x+i == p->img->r.max.x);
1403
1404         /* copy from beginning of repl rectangle until where we were before. */
1405         x = p->img->r.min.x;
1406         n = dx;
1407         if(n > p->r.min.x - x)
1408                 n = p->r.min.x - x;
1409
1410         r = p->bytey0s + y*p->bwidth;
1411 DBG print("x=%d r=%p...", x, r);
1412         bits = *r++;
1413         nbits = 8;
1414         if(i=x&(npack-1)){
1415                 bits <<= depth*i;
1416                 nbits -= depth*i;
1417         }
1418 DBG print("nbits=%d...", nbits);
1419         for(i=0; i<n; i++){
1420                 if(nbits == 0){
1421                         bits = *r++;
1422                         nbits = 8;
1423                 }
1424                 *w++ = repl[bits>>sh];
1425 DBG print("bit %x...", repl[bits>>sh]);
1426                 bits <<= depth;
1427                 nbits -= depth;
1428 DBG print("bits %x nbits %d...", bits, nbits);
1429         }
1430         dx -= n;
1431         if(dx == 0)
1432                 return b;
1433
1434         assert(dx > 0);
1435         /* now we have exactly one full scan line: just replicate the buffer itself until we are done */
1436         ow = buf;
1437         while(dx--)
1438                 *w++ = *ow++;
1439
1440         return b;
1441 }
1442 #undef DBG
1443
1444 #define DBG if(0)
1445 static void
1446 writenbit(Param *p, uchar *w, Buffer src)
1447 {
1448         uchar *r;
1449         ulong bits;
1450         int i, sh, depth, npack, nbits, x, ex;
1451
1452         assert(src.grey != nil && src.delta == 1);
1453
1454         x = p->r.min.x;
1455         ex = x+p->dx;
1456         depth = p->img->depth;
1457         npack = 8/depth;
1458
1459         i=x&(npack-1);
1460         bits = i ? (*w >> (8-depth*i)) : 0;
1461         nbits = depth*i;
1462         sh = 8-depth;
1463         r = src.grey;
1464
1465         for(; x<ex; x++){
1466                 bits <<= depth;
1467 DBG print(" %x", *r);
1468                 bits |= (*r++ >> sh);
1469                 nbits += depth;
1470                 if(nbits == 8){
1471                         *w++ = bits;
1472                         nbits = 0;
1473                 }
1474         }
1475
1476         if(nbits){
1477                 sh = 8-nbits;
1478                 bits <<= sh;
1479                 bits |= *w & ((1<<sh)-1);
1480                 *w = bits;
1481         }
1482 DBG print("\n");
1483         return;
1484 }
1485 #undef DBG
1486
1487 static Buffer
1488 readcmap(Param *p, uchar *buf, int y)
1489 {
1490         Buffer b;
1491         int a, convgrey, copyalpha, dx, i, m;
1492         uchar *q, *cmap, *begin, *end, *r, *w;
1493
1494         begin = p->bytey0s + y*p->bwidth;
1495         r = p->bytermin + y*p->bwidth;
1496         end = p->bytey0e + y*p->bwidth;
1497         cmap = p->img->cmap->cmap2rgb;
1498         convgrey = p->convgrey;
1499         copyalpha = (p->img->flags&Falpha) != 0;
1500
1501         w = buf;
1502         dx = p->dx;
1503         if(copyalpha){
1504                 b.alpha = buf++;
1505                 a = p->img->shift[CAlpha]/8;
1506                 m = p->img->shift[CMap]/8;
1507                 for(i=0; i<dx; i++){
1508                         *w++ = r[a];
1509                         q = cmap+r[m]*3;
1510                         r += 2;
1511                         if(r == end)
1512                                 r = begin;
1513                         if(convgrey){
1514                                 *w++ = RGB2K(q[0], q[1], q[2]);
1515                         }else{
1516                                 *w++ = q[2];    /* blue */
1517                                 *w++ = q[1];    /* green */
1518                                 *w++ = q[0];    /* red */
1519                         }
1520                 }
1521         }else{
1522                 b.alpha = &ones;
1523                 for(i=0; i<dx; i++){
1524                         q = cmap+*r++*3;
1525                         if(r == end)
1526                                 r = begin;
1527                         if(convgrey){
1528                                 *w++ = RGB2K(q[0], q[1], q[2]);
1529                         }else{
1530                                 *w++ = q[2];    /* blue */
1531                                 *w++ = q[1];    /* green */
1532                                 *w++ = q[0];    /* red */
1533                         }
1534                 }
1535         }
1536
1537         b.rgba = (ulong*)(buf-copyalpha);
1538
1539         if(convgrey){
1540                 b.grey = buf;
1541                 b.red = b.blu = b.grn = buf;
1542                 b.delta = 1+copyalpha;
1543         }else{
1544                 b.blu = buf;
1545                 b.grn = buf+1;
1546                 b.red = buf+2;
1547                 b.grey = nil;
1548                 b.delta = 3+copyalpha;
1549         }
1550         return b;
1551 }
1552
1553 static void
1554 writecmap(Param *p, uchar *w, Buffer src)
1555 {
1556         uchar *cmap, *red, *grn, *blu, *alpha;
1557         int i, dx, delta, a, m;
1558
1559         cmap = p->img->cmap->rgb2cmap;
1560
1561         delta = src.delta;
1562         red= src.red;
1563         grn = src.grn;
1564         blu = src.blu;
1565
1566         dx = p->dx;
1567         if(p->img->flags&Falpha){
1568                 alpha = src.alpha;
1569                 m = p->img->shift[CMap]/8;
1570                 a = p->img->shift[CAlpha]/8;
1571                 for(i=0; i<dx; i++, red+=delta, grn+=delta, blu+=delta, w+=2){
1572                         w[a] = *alpha;
1573                         if(alpha != &ones)
1574                                 alpha+=delta;
1575                         w[m] = cmap[(*red>>4)*256+(*grn>>4)*16+(*blu>>4)];
1576                 }
1577         } else {
1578                 for(i=0; i<dx; i++, red+=delta, grn+=delta, blu+=delta)
1579                         *w++ = cmap[(*red>>4)*256+(*grn>>4)*16+(*blu>>4)];
1580         }
1581 }
1582
1583 #define DBG if(0)
1584 static Buffer
1585 readbyte(Param *p, uchar *buf, int y)
1586 {
1587         Buffer b;
1588         Memimage *img;
1589         int dx, isgrey, convgrey, alphaonly, copyalpha, i, nb;
1590         uchar *begin, *end, *r, *w, *rrepl, *grepl, *brepl, *arepl, *krepl;
1591         uchar ured, ugrn, ublu;
1592         ulong u;
1593
1594         img = p->img;
1595         begin = p->bytey0s + y*p->bwidth;
1596         r = p->bytermin + y*p->bwidth;
1597         end = p->bytey0e + y*p->bwidth;
1598
1599         w = buf;
1600         dx = p->dx;
1601         nb = img->depth/8;
1602
1603         convgrey = p->convgrey; /* convert rgb to grey */
1604         isgrey = img->flags&Fgrey;
1605         alphaonly = p->alphaonly;
1606         copyalpha = (img->flags&Falpha) != 0;
1607
1608 DBG print("copyalpha %d alphaonly %d convgrey %d isgrey %d\n", copyalpha, alphaonly, convgrey, isgrey);
1609         /* if we can, avoid processing everything */
1610         if(!(img->flags&Frepl) && !convgrey && (img->flags&Fbytes)){
1611                 memset(&b, 0, sizeof b);
1612                 if(p->needbuf){
1613                         memmove(buf, r, dx*nb);
1614                         r = buf;
1615                 }
1616                 b.rgba = (ulong*)r;
1617                 if(copyalpha)
1618                         b.alpha = r+img->shift[CAlpha]/8;
1619                 else
1620                         b.alpha = &ones;
1621                 if(isgrey){
1622                         b.grey = r+img->shift[CGrey]/8;
1623                         b.red = b.grn = b.blu = b.grey;
1624                 }else{
1625                         b.red = r+img->shift[CRed]/8;
1626                         b.grn = r+img->shift[CGreen]/8;
1627                         b.blu = r+img->shift[CBlue]/8;
1628                 }
1629                 b.delta = nb;
1630                 return b;
1631         }
1632
1633 DBG print("2\n");
1634         rrepl = replbit[img->nbits[CRed]];
1635         grepl = replbit[img->nbits[CGreen]];
1636         brepl = replbit[img->nbits[CBlue]];
1637         arepl = replbit[img->nbits[CAlpha]];
1638         krepl = replbit[img->nbits[CGrey]];
1639
1640         for(i=0; i<dx; i++){
1641                 u = r[0] | (r[1]<<8) | (r[2]<<16) | (r[3]<<24);
1642                 if(copyalpha) {
1643                         *w++ = arepl[(u>>img->shift[CAlpha]) & img->mask[CAlpha]];
1644 DBG print("a %x\n", w[-1]);
1645                 }
1646
1647                 if(isgrey)
1648                         *w++ = krepl[(u >> img->shift[CGrey]) & img->mask[CGrey]];
1649                 else if(!alphaonly){
1650                         ured = rrepl[(u >> img->shift[CRed]) & img->mask[CRed]];
1651                         ugrn = grepl[(u >> img->shift[CGreen]) & img->mask[CGreen]];
1652                         ublu = brepl[(u >> img->shift[CBlue]) & img->mask[CBlue]];
1653                         if(convgrey){
1654 DBG print("g %x %x %x\n", ured, ugrn, ublu);
1655                                 *w++ = RGB2K(ured, ugrn, ublu);
1656 DBG print("%x\n", w[-1]);
1657                         }else{
1658                                 w[0] = ublu;
1659                                 w[1] = ugrn;
1660                                 w[2] = ured;
1661                                 w += 3;
1662                         }
1663                 }
1664                 r += nb;
1665                 if(r == end)
1666                         r = begin;
1667         }
1668
1669         b.alpha = copyalpha ? buf : &ones;
1670         b.rgba = (ulong*)buf;
1671         if(alphaonly){
1672                 b.red = b.grn = b.blu = b.grey = nil;
1673                 if(!copyalpha)
1674                         b.rgba = nil;
1675                 b.delta = 1;
1676         }else if(isgrey || convgrey){
1677                 b.grey = buf+copyalpha;
1678                 b.red = b.grn = b.blu = buf+copyalpha;
1679                 b.delta = copyalpha+1;
1680 DBG print("alpha %x grey %x\n", *b.alpha, *b.grey);
1681         }else{
1682                 b.blu = buf+copyalpha;
1683                 b.grn = buf+copyalpha+1;
1684                 b.grey = nil;
1685                 b.red = buf+copyalpha+2;
1686                 b.delta = copyalpha+3;
1687         }
1688         return b;
1689 }
1690 #undef DBG
1691
1692 #define DBG if(0)
1693 static void
1694 writebyte(Param *p, uchar *w, Buffer src)
1695 {
1696         Memimage *img;
1697         int i, isalpha, isgrey, nb, delta, dx, adelta;
1698         uchar *red, *grn, *blu, *grey, *alpha;
1699         ulong u, mask;
1700
1701         img = p->img;
1702
1703         red = src.red;
1704         grn = src.grn;
1705         blu = src.blu;
1706         alpha = src.alpha;
1707         delta = src.delta;
1708         grey = src.grey;
1709         dx = p->dx;
1710
1711         nb = img->depth/8;
1712
1713         isalpha = img->flags&Falpha;
1714         isgrey = img->flags&Fgrey;
1715         adelta = src.delta;
1716
1717         if(isalpha && alpha == &ones)
1718                 adelta = 0;
1719
1720         if((img->flags&Fbytes) != 0){
1721                 int ogry, ored, ogrn, oblu, oalp;
1722
1723                 ogry = img->shift[CGrey]/8;
1724                 ored = img->shift[CRed]/8;
1725                 ogrn = img->shift[CGreen]/8;
1726                 oblu = img->shift[CBlue]/8;
1727                 oalp = img->shift[CAlpha]/8;
1728
1729                 for(i=0; i<dx; i++){
1730                         if(isgrey){
1731                                 w[ogry] = *grey;
1732                                 grey += delta;
1733                         } else {
1734                                 w[ored] = *red;
1735                                 w[ogrn] = *grn;
1736                                 w[oblu] = *blu;
1737                                 red += delta;
1738                                 grn += delta;
1739                                 blu += delta;
1740                         }
1741                         if(isalpha){
1742                                 w[oalp] = *alpha;
1743                                 alpha += adelta;
1744                         }
1745                         w += nb;
1746                 }
1747                 return;
1748         }
1749
1750         mask = (nb==4) ? 0 : ~((1<<img->depth)-1);
1751         for(i=0; i<dx; i++){
1752                 u = w[0] | (w[1]<<8) | (w[2]<<16) | (w[3]<<24);
1753 DBG print("u %.8lux...", u);
1754                 u &= mask;
1755 DBG print("&mask %.8lux...", u);
1756                 if(isgrey){
1757                         u |= ((*grey >> (8-img->nbits[CGrey])) & img->mask[CGrey]) << img->shift[CGrey];
1758 DBG print("|grey %.8lux...", u);
1759                         grey += delta;
1760                 }else{
1761                         u |= ((*red >> (8-img->nbits[CRed])) & img->mask[CRed]) << img->shift[CRed];
1762                         u |= ((*grn >> (8-img->nbits[CGreen])) & img->mask[CGreen]) << img->shift[CGreen];
1763                         u |= ((*blu >> (8-img->nbits[CBlue])) & img->mask[CBlue]) << img->shift[CBlue];
1764                         red += delta;
1765                         grn += delta;
1766                         blu += delta;
1767 DBG print("|rgb %.8lux...", u);
1768                 }
1769
1770                 if(isalpha){
1771                         u |= ((*alpha >> (8-img->nbits[CAlpha])) & img->mask[CAlpha]) << img->shift[CAlpha];
1772                         alpha += adelta;
1773 DBG print("|alpha %.8lux...", u);
1774                 }
1775
1776                 w[0] = u;
1777                 w[1] = u>>8;
1778                 w[2] = u>>16;
1779                 w[3] = u>>24;
1780                 w += nb;
1781         }
1782 }
1783 #undef DBG
1784
1785 static Readfn*
1786 readfn(Memimage *img)
1787 {
1788         if(img->depth < 8)
1789                 return readnbit;
1790         if(img->nbits[CMap] == 8)
1791                 return readcmap;
1792         return readbyte;
1793 }
1794
1795 static Readfn*
1796 readalphafn(Memimage *m)
1797 {
1798         USED(m);
1799         return readbyte;
1800 }
1801
1802 static Writefn*
1803 writefn(Memimage *img)
1804 {
1805         if(img->depth < 8)
1806                 return writenbit;
1807         if(img->nbits[CMap] == 8)
1808                 return writecmap;
1809         return writebyte;
1810 }
1811
1812 static void
1813 nullwrite(Param *p, uchar *s, Buffer b)
1814 {
1815         USED(p);
1816         USED(s);
1817         USED(b);
1818 }
1819
1820 static Buffer
1821 readptr(Param *p, uchar *s, int y)
1822 {
1823         Buffer b;
1824         uchar *q;
1825
1826         USED(s);
1827         q = p->bytermin + y*p->bwidth;
1828         b.red = q;      /* ptr to data */
1829         b.grn = b.blu = b.grey = nil;
1830         b.alpha = &ones;
1831         b.rgba = (ulong*)q;
1832         b.delta = p->img->depth/8;
1833         return b;
1834 }
1835
1836 static Buffer
1837 boolmemmove(Buffer bdst, Buffer bsrc, Buffer b1, int dx, int i, int o)
1838 {
1839         USED(i);
1840         USED(o);
1841         USED(b1);
1842         USED(bsrc);
1843         memmove(bdst.red, bsrc.red, dx*bdst.delta);
1844         return bdst;
1845 }
1846
1847 static Buffer
1848 boolcopy8(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1849 {
1850         uchar *m, *r, *w, *ew;
1851
1852         USED(i);
1853         USED(o);
1854         m = bmask.grey;
1855         w = bdst.red;
1856         r = bsrc.red;
1857         ew = w+dx;
1858         for(; w < ew; w++,r++)
1859                 if(*m++)
1860                         *w = *r;
1861         return bdst;    /* not used */
1862 }
1863
1864 static Buffer
1865 boolcopy16(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1866 {
1867         uchar *m;
1868         ushort *r, *w, *ew;
1869
1870         USED(i);
1871         USED(o);
1872         m = bmask.grey;
1873         w = (ushort*)bdst.red;
1874         r = (ushort*)bsrc.red;
1875         ew = w+dx;
1876         for(; w < ew; w++,r++)
1877                 if(*m++)
1878                         *w = *r;
1879         return bdst;    /* not used */
1880 }
1881
1882 static Buffer
1883 boolcopy24(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1884 {
1885         uchar *m;
1886         uchar *r, *w, *ew;
1887
1888         USED(i);
1889         USED(o);
1890         m = bmask.grey;
1891         w = bdst.red;
1892         r = bsrc.red;
1893         ew = w+dx*3;
1894         while(w < ew){
1895                 if(*m++){
1896                         *w++ = *r++;
1897                         *w++ = *r++;
1898                         *w++ = *r++;
1899                 }else{
1900                         w += 3;
1901                         r += 3;
1902                 }
1903         }
1904         return bdst;    /* not used */
1905 }
1906
1907 static Buffer
1908 boolcopy32(Buffer bdst, Buffer bsrc, Buffer bmask, int dx, int i, int o)
1909 {
1910         uchar *m;
1911         ulong *r, *w, *ew;
1912
1913         USED(i);
1914         USED(o);
1915         m = bmask.grey;
1916         w = (ulong*)bdst.red;
1917         r = (ulong*)bsrc.red;
1918         ew = w+dx;
1919         for(; w < ew; w++,r++)
1920                 if(*m++)
1921                         *w = *r;
1922         return bdst;    /* not used */
1923 }
1924
1925 static Buffer
1926 genconv(Param *p, uchar *buf, int y)
1927 {
1928         Buffer b;
1929         int nb;
1930         uchar *r, *w, *ew;
1931
1932         /* read from source into RGB format in convbuf */
1933         b = p->convreadcall(p, p->convbuf, y);
1934
1935         /* write RGB format into dst format in buf */
1936         p->convwritecall(p->convdpar, buf, b);
1937
1938         if(p->convdx){
1939                 nb = p->convdpar->img->depth/8;
1940                 r = buf;
1941                 w = buf+nb*p->dx;
1942                 ew = buf+nb*p->convdx;
1943                 while(w<ew)
1944                         *w++ = *r++;
1945         }
1946
1947         b.red = buf;
1948         b.blu = b.grn = b.grey = nil;
1949         b.alpha = &ones;
1950         b.rgba = (ulong*)buf;
1951         b.delta = 0;
1952
1953         return b;
1954 }
1955
1956 static Readfn*
1957 convfn(Memimage *dst, Param *dpar, Memimage *src, Param *spar, int *ndrawbuf)
1958 {
1959         if(dst->chan == src->chan && !(src->flags&Frepl)){
1960 //if(drawdebug) iprint("readptr...");
1961                 return readptr;
1962         }
1963
1964         if(dst->chan==CMAP8 && (src->chan==GREY1||src->chan==GREY2||src->chan==GREY4)){
1965                 /* cheat because we know the replicated value is exactly the color map entry. */
1966 //if(drawdebug) iprint("Readnbit...");
1967                 return readnbit;
1968         }
1969
1970         spar->convreadcall = readfn(src);
1971         spar->convwritecall = writefn(dst);
1972         spar->convdpar = dpar;
1973
1974         /* allocate a conversion buffer */
1975         spar->convbufoff = *ndrawbuf;
1976         *ndrawbuf += spar->dx*4;
1977
1978         if(spar->dx > Dx(spar->img->r)){
1979                 spar->convdx = spar->dx;
1980                 spar->dx = Dx(spar->img->r);
1981         }
1982
1983 //if(drawdebug) iprint("genconv...");
1984         return genconv;
1985 }
1986
1987 static ulong
1988 pixelbits(Memimage *i, Point pt)
1989 {
1990         uchar *p;
1991         ulong val;
1992         int off, bpp, npack;
1993
1994         val = 0;
1995         p = byteaddr(i, pt);
1996         switch(bpp=i->depth){
1997         case 1:
1998         case 2:
1999         case 4:
2000                 npack = 8/bpp;
2001                 off = pt.x%npack;
2002                 val = p[0] >> bpp*(npack-1-off);
2003                 val &= (1<<bpp)-1;
2004                 break;
2005         case 8:
2006                 val = p[0];
2007                 break;
2008         case 16:
2009                 val = p[0]|(p[1]<<8);
2010                 break;
2011         case 24:
2012                 val = p[0]|(p[1]<<8)|(p[2]<<16);
2013                 break;
2014         case 32:
2015                 val = p[0]|(p[1]<<8)|(p[2]<<16)|(p[3]<<24);
2016                 break;
2017         }
2018         while(bpp<32){
2019                 val |= val<<bpp;
2020                 bpp *= 2;
2021         }
2022         return val;
2023 }
2024
2025 static Calcfn*
2026 boolcopyfn(Memimage *img, Memimage *mask)
2027 {
2028         if(mask->flags&Frepl && Dx(mask->r)==1 && Dy(mask->r)==1 && pixelbits(mask, mask->r.min)==~0)
2029                 return boolmemmove;
2030
2031         switch(img->depth){
2032         case 8:
2033                 return boolcopy8;
2034         case 16:
2035                 return boolcopy16;
2036         case 24:
2037                 return boolcopy24;
2038         case 32:
2039                 return boolcopy32;
2040         default:
2041                 assert(0 /* boolcopyfn */);
2042         }
2043         return nil;
2044 }
2045
2046 /*
2047  * Optimized draw for filling and scrolling; uses memset and memmove.
2048  */
2049 static void
2050 memsets(void *vp, ushort val, int n)
2051 {
2052         ushort *p, *ep;
2053         uchar b[2];
2054
2055         /* make little endian */
2056         b[0] = val;
2057         b[1] = val>>8;
2058         val = *(ushort*)b;
2059
2060         p = vp;
2061         ep = p+n;
2062         while(p<ep)
2063                 *p++ = val;
2064 }
2065
2066 static void
2067 memsetl(void *vp, ulong val, int n)
2068 {
2069         ulong *p, *ep;
2070         uchar b[4];
2071
2072         /* make little endian */
2073         b[0] = val;
2074         b[1] = val>>8;
2075         b[2] = val>>16;
2076         b[3] = val>>24;
2077         val = *(ulong*)b;
2078
2079         p = vp;
2080         ep = p+n;
2081         while(p<ep)
2082                 *p++ = val;
2083 }
2084
2085 static void
2086 memset24(void *vp, ulong val, int n)
2087 {
2088         uchar *p, *ep;
2089         uchar a,b,c;
2090
2091         a = val;
2092         b = val>>8;
2093         c = val>>16;
2094
2095         p = vp;
2096         ep = p+3*n;
2097         while(p<ep){
2098                 p[0] = a;
2099                 p[1] = b;
2100                 p[2] = c;
2101                 p += 3;
2102         }
2103 }
2104
2105 static ulong
2106 imgtorgba(Memimage *img, ulong val)
2107 {
2108         uchar r, g, b, a;
2109         int nb, ov, v;
2110         ulong chan;
2111         uchar *p;
2112
2113         a = 0xFF;
2114         r = g = b = 0xAA;       /* garbage */
2115         for(chan=img->chan; chan; chan>>=8){
2116                 nb = NBITS(chan);
2117                 ov = v = val&((1<<nb)-1);
2118                 val >>= nb;
2119
2120                 while(nb < 8){
2121                         v |= v<<nb;
2122                         nb *= 2;
2123                 }
2124                 v >>= (nb-8);
2125
2126                 switch(TYPE(chan)){
2127                 case CRed:
2128                         r = v;
2129                         break;
2130                 case CGreen:
2131                         g = v;
2132                         break;
2133                 case CBlue:
2134                         b = v;
2135                         break;
2136                 case CAlpha:
2137                         a = v;
2138                         break;
2139                 case CGrey:
2140                         r = g = b = v;
2141                         break;
2142                 case CMap:
2143                         p = img->cmap->cmap2rgb+3*ov;
2144                         r = p[0];
2145                         g = p[1];
2146                         b = p[2];
2147                         break;
2148                 }
2149         }
2150         return (r<<24)|(g<<16)|(b<<8)|a;
2151 }
2152
2153 static ulong
2154 rgbatoimg(Memimage *img, ulong rgba)
2155 {
2156         ulong chan;
2157         int d, nb;
2158         ulong v;
2159         uchar *p, r, g, b, a, m;
2160
2161         v = 0;
2162         r = rgba>>24;
2163         g = rgba>>16;
2164         b = rgba>>8;
2165         a = rgba;
2166         d = 0;
2167         for(chan=img->chan; chan; chan>>=8){
2168                 nb = NBITS(chan);
2169                 switch(TYPE(chan)){
2170                 case CRed:
2171                         v |= (r>>(8-nb))<<d;
2172                         break;
2173                 case CGreen:
2174                         v |= (g>>(8-nb))<<d;
2175                         break;
2176                 case CBlue:
2177                         v |= (b>>(8-nb))<<d;
2178                         break;
2179                 case CAlpha:
2180                         v |= (a>>(8-nb))<<d;
2181                         break;
2182                 case CMap:
2183                         p = img->cmap->rgb2cmap;
2184                         m = p[(r>>4)*256+(g>>4)*16+(b>>4)];
2185                         v |= (m>>(8-nb))<<d;
2186                         break;
2187                 case CGrey:
2188                         m = RGB2K(r,g,b);
2189                         v |= (m>>(8-nb))<<d;
2190                         break;
2191                 }
2192                 d += nb;
2193         }
2194 //      print("rgba2img %.8lux = %.*lux\n", rgba, 2*d/8, v);
2195         return v;
2196 }
2197
2198 #define DBG if(0)
/*
 * memoptdraw: try to carry out the draw described by par with one of
 * three special-case fast paths.  Returns 1 if one of them did the
 * draw, 0 if none applied (nothing has been drawn in that case).
 *
 *  1. Fill: par->state has Simplesrc, Simplemask and Fullmask, the
 *     source alpha byte is 0xFF and op is S or SoverD.  The
 *     destination rectangle is filled with par->sdval.
 *  2. Copy: Simplemask and Fullmask without Replsrc, source depth
 *     >= 8, identical channel formats, no source alpha, op S or
 *     SoverD.  Rows are copied with memmove.
 *  3. 1-bit source, mask and destination whose rectangles start at
 *     the same bit offset within a byte: rows are combined a byte at
 *     a time with bit operations.
 */
2199 static int
2200 memoptdraw(Memdrawparam *par)
2201 {
2202         int m, y, dy, dx, op;
2203         ulong v;
2204         Memimage *src;
2205         Memimage *dst;
2206
2207         dx = Dx(par->r);
2208         dy = Dy(par->r);
2209         src = par->src;
2210         dst = par->dst;
2211         op = par->op;
2212
2213 DBG print("state %lux mval %lux dd %d\n", par->state, par->mval, dst->depth);
2214         /*
2215          * If we have an opaque mask and source is one opaque pixel we can convert to the
2216          * destination format and just replicate with memset.
2217          */
2218         m = Simplesrc|Simplemask|Fullmask;
2219         if((par->state&m)==m && (par->srgba&0xFF) == 0xFF && (op ==S || op == SoverD)){
2220                 int d, dwid, ppb, np, nb;
2221                 uchar *dp, lm, rm;
2222
2223 DBG print("memopt, dst %p, dst->data->bdata %p\n", dst, dst->data->bdata);
              /* dst->width counts ulongs per row; dwid is the row stride in bytes */
2224                 dwid = dst->width*sizeof(ulong);
2225                 dp = byteaddr(dst, par->r.min);
2226                 v = par->sdval;
2227 DBG print("sdval %lud, depth %d\n", v, dst->depth);
2228                 switch(dst->depth){
2229                 case 1:
2230                 case 2:
2231                 case 4:
                      /* repeat the pixel value until it fills a byte */
2232                         for(d=dst->depth; d<8; d*=2)
2233                                 v |= (v<<d);
2234                         ppb = 8/dst->depth;     /* pixels per byte */
2235                         m = ppb-1;
2236                         /* left edge */
2237                         np = par->r.min.x&m;            /* no. pixels unused on left side of word */
2238                         dx -= (ppb-np);
2239                         nb = 8 - np * dst->depth;               /* no. bits used on right side of word */
2240                         lm = (1<<nb)-1;
2241 DBG print("np %d x %d nb %d lm %ux ppb %d m %ux\n", np, par->r.min.x, nb, lm, ppb, m);
2242
2243                         /* right edge */
2244                         np = par->r.max.x&m;    /* no. pixels used on left side of word */
2245                         dx -= np;
2246                         nb = 8 - np * dst->depth;               /* no. bits unused on right side of word */
2247                         rm = ~((1<<nb)-1);
2248 DBG print("np %d x %d nb %d rm %ux ppb %d m %ux\n", np, par->r.max.x, nb, rm, ppb, m);
2249
2250 DBG print("dx %d Dx %d\n", dx, Dx(par->r));
2251                         /* lm, rm are masks that are 1 where we should touch the bits */
                      /*
                       * In the loops below, x ^= (v ^ x) & mask replaces the bits
                       * of x selected by mask with the matching bits of v and
                       * leaves the other bits alone.
                       */
2252                         if(dx < 0){     /* just one byte */
2253                                 lm &= rm;
2254                                 for(y=0; y<dy; y++, dp+=dwid)
2255                                         *dp ^= (v ^ *dp) & lm;
2256                         }else if(dx == 0){      /* no full bytes */
                              /* dp is advanced one byte inside the loop when lm is set */
2257                                 if(lm)
2258                                         dwid--;
2259
2260                                 for(y=0; y<dy; y++, dp+=dwid){
2261                                         if(lm){
2262 DBG print("dp %p v %lux lm %ux (v ^ *dp) & lm %lux\n", dp, v, lm, (v^*dp)&lm);
2263                                                 *dp ^= (v ^ *dp) & lm;
2264                                                 dp++;
2265                                         }
2266                                         *dp ^= (v ^ *dp) & rm;
2267                                 }
2268                         }else{          /* full bytes in middle */
                              /* dx becomes a byte count; dwid the step from row end to next row start */
2269                                 dx /= ppb;
2270                                 if(lm)
2271                                         dwid--;
2272                                 dwid -= dx;
2273
2274                                 for(y=0; y<dy; y++, dp+=dwid){
2275                                         if(lm){
2276                                                 *dp ^= (v ^ *dp) & lm;
2277                                                 dp++;
2278                                         }
2279                                         memset(dp, v, dx);
2280                                         dp += dx;
2281                                         *dp ^= (v ^ *dp) & rm;
2282                                 }
2283                         }
2284                         return 1;
2285                 case 8:
2286                         for(y=0; y<dy; y++, dp+=dwid)
2287                                 memset(dp, v, dx);
2288                         return 1;
2289                 case 16:
2290                         for(y=0; y<dy; y++, dp+=dwid)
2291                                 memsets(dp, v, dx);
2292                         return 1;
2293                 case 24:
2294                         for(y=0; y<dy; y++, dp+=dwid)
2295                                 memset24(dp, v, dx);
2296                         return 1;
2297                 case 32:
2298                         for(y=0; y<dy; y++, dp+=dwid)
2299                                 memsetl(dp, v, dx);
2300                         return 1;
2301                 default:
2302                         assert(0 /* bad dest depth in memoptdraw */);
2303                 }
2304         }
2305
2306         /*
2307          * If no source alpha, an opaque mask, we can just copy the
2308          * source onto the destination.  If the channels are the same and
2309          * the source is not replicated, memmove suffices.
2310          */
2311         m = Simplemask|Fullmask;
2312         if((par->state&(m|Replsrc))==m && src->depth >= 8
2313         && src->chan == dst->chan && !(src->flags&Falpha) && (op == S || op == SoverD)){
2314                 uchar *sp, *dp;
2315                 long swid, dwid, nb;
2316                 int dir;
2317
              /*
               * Same backing data with the destination at the higher
               * address: rows are copied from last to first (presumably
               * so overlapping source rows are read before they are
               * overwritten).
               */
2318                 if(src->data == dst->data && byteaddr(dst, par->r.min) > byteaddr(src, par->sr.min))
2319                         dir = -1;
2320                 else
2321                         dir = 1;
2322
2323                 swid = src->width*sizeof(ulong);
2324                 dwid = dst->width*sizeof(ulong);
2325                 sp = byteaddr(src, par->sr.min);
2326                 dp = byteaddr(dst, par->r.min);
2327                 if(dir == -1){
                      /* start on the last row and step backwards */
2328                         sp += (dy-1)*swid;
2329                         dp += (dy-1)*dwid;
2330                         swid = -swid;
2331                         dwid = -dwid;
2332                 }
              /* bytes per row of the rectangle */
2333                 nb = (dx*src->depth)/8;
2334                 for(y=0; y<dy; y++, sp+=swid, dp+=dwid)
2335                         memmove(dp, sp, nb);
2336                 return 1;
2337         }
2338
2339         /*
2340          * If we have a 1-bit mask, 1-bit source, and 1-bit destination, and
2341          * they're all bit aligned, we can just use bit operators.  This happens
2342          * when we're manipulating boolean masks, e.g. in the arc code.
2343          */
2344         if((par->state&(Simplemask|Simplesrc|Replmask|Replsrc))==0
2345         && dst->chan==GREY1 && src->chan==GREY1 && par->mask->chan==GREY1
2346         && (par->r.min.x&7)==(par->sr.min.x&7) && (par->r.min.x&7)==(par->mr.min.x&7)){
2347                 uchar *sp, *dp, *mp;
2348                 uchar lm, rm;
2349                 long swid, dwid, mwid;
2350                 int i, x, dir;
2351
2352                 sp = byteaddr(src, par->sr.min);
2353                 dp = byteaddr(dst, par->r.min);
2354                 mp = byteaddr(par->mask, par->mr.min);
2355                 swid = src->width*sizeof(ulong);
2356                 dwid = dst->width*sizeof(ulong);
2357                 mwid = par->mask->width*sizeof(ulong);
2358
2359                 if(src->data == dst->data && byteaddr(dst, par->r.min) > byteaddr(src, par->sr.min)){
2360                         dir = -1;
2361                 }else
2362                         dir = 1;
2363
              /*
               * lm selects the bits of the first byte from r.min.x
               * rightwards, rm the bits of the last byte left of r.max.x.
               * dx is reduced to the pixels in the whole bytes between the
               * two; it goes negative when the span fits in one byte.
               */
2364                 lm = 0xFF>>(par->r.min.x&7);
2365                 rm = 0xFF<<(8-(par->r.max.x&7));
2366                 dx -= (8-(par->r.min.x&7)) + (par->r.max.x&7);
2367
2368                 if(dx < 0){     /* one byte wide */
2369                         lm &= rm;
2370                         if(dir == -1){
2371                                 dp += dwid*(dy-1);
2372                                 sp += swid*(dy-1);
2373                                 mp += mwid*(dy-1);
2374                                 dwid = -dwid;
2375                                 swid = -swid;
2376                                 mwid = -mwid;
2377                         }
2378                         for(y=0; y<dy; y++){
                              /* take source bits where both the mask image and lm are 1 */
2379                                 *dp ^= (*dp ^ *sp) & *mp & lm;
2380                                 dp += dwid;
2381                                 sp += swid;
2382                                 mp += mwid;
2383                         }
2384                         return 1;
2385                 }
2386
2387                 dx /= 8;
2388                 if(dir == 1){
                      /* i bytes are consumed per row; strides become row end to next row start */
2389                         i = (lm!=0)+dx+(rm!=0);
2390                         mwid -= i;
2391                         swid -= i;
2392                         dwid -= i;
2393                         for(y=0; y<dy; y++, dp+=dwid, sp+=swid, mp+=mwid){
2394                                 if(lm){
2395                                         *dp ^= (*dp ^ *sp++) & *mp++ & lm;
2396                                         dp++;
2397                                 }
2398                                 for(x=0; x<dx; x++){
2399                                         *dp ^= (*dp ^ *sp++) & *mp++;
2400                                         dp++;
2401                                 }
2402                                 if(rm){
2403                                         *dp ^= (*dp ^ *sp++) & *mp++ & rm;
2404                                         dp++;
2405                                 }
2406                         }
2407                         return 1;
2408                 }else{
2409                 /* dir == -1 */
                      /*
                       * Start at the last byte of the last row and walk each
                       * row right to left; after a row the pointers sit i
                       * bytes back, so the step to the end of the previous
                       * row is -width+i.
                       */
2410                         i = (lm!=0)+dx+(rm!=0);
2411                         dp += dwid*(dy-1)+i-1;
2412                         sp += swid*(dy-1)+i-1;
2413                         mp += mwid*(dy-1)+i-1;
2414                         dwid = -dwid+i;
2415                         swid = -swid+i;
2416                         mwid = -mwid+i;
2417                         for(y=0; y<dy; y++, dp+=dwid, sp+=swid, mp+=mwid){
2418                                 if(rm){
2419                                         *dp ^= (*dp ^ *sp--) & *mp-- & rm;
2420                                         dp--;
2421                                 }
2422                                 for(x=0; x<dx; x++){
2423                                         *dp ^= (*dp ^ *sp--) & *mp--;
2424                                         dp--;
2425                                 }
2426                                 if(lm){
2427                                         *dp ^= (*dp ^ *sp--) & *mp-- & lm;
2428                                         dp--;
2429                                 }
2430                         }
2431                 }
2432                 return 1;
2433         }
2434         return 0;
2435 }
2436 #undef DBG
2437
2438 /*
2439  * Boolean character drawing.
2440  * Solid opaque color through a 1-bit greyscale mask.
2441  */
2442 #define DBG if(0)
/*
 * chardraw: fast path for drawing one solid value through a 1-bit mask.
 *
 * Returns 0 without drawing unless all of these hold: par->state has
 * Replsrc and Simplesrc set and Replmask clear, the mask is 1 bit
 * deep, the source has no alpha channel, the destination is at least
 * 8 bits deep, source and destination do not share data, and op is
 * SoverD.
 *
 * Otherwise, for every pixel of par->r whose mask bit (taken from
 * par->mr) is 1, par->sdval is stored into the destination in little
 * endian byte order, and 1 is returned.
 */
2443 static int
2444 chardraw(Memdrawparam *par)
2445 {
2446         int i, ddepth, dy, dx, x, bx, ex, y, npack, bsh, depth, op;
2447         ulong bits, v, maskwid, dstwid;
2448         uchar *wp, *rp, *q, *wc;
2449         ushort *ws;
2450         ulong *wl;
2451         uchar sp[4];
2452         Rectangle r, mr;
2453         Memimage *mask, *src, *dst;
2454
2455 if(0) if(drawdebug) iprint("chardraw? mf %lux md %d sf %lux dxs %d dys %d dd %d ddat %p sdat %p\n",
2456                 par->mask->flags, par->mask->depth, par->src->flags,
2457                 Dx(par->src->r), Dy(par->src->r), par->dst->depth, par->dst->data, par->src->data);
2458
2459         mask = par->mask;
2460         src = par->src;
2461         dst = par->dst;
2462         r = par->r;
2463         mr = par->mr;
2464         op = par->op;
2465
      /* bail out unless this is the case handled here (see header) */
2466         if((par->state&(Replsrc|Simplesrc|Replmask)) != (Replsrc|Simplesrc)
2467         || mask->depth != 1 || src->flags&Falpha || dst->depth<8 || dst->data==src->data
2468         || op != SoverD)
2469                 return 0;
2470
2471 //if(drawdebug) iprint("chardraw...");
2472
      /* rp walks the mask rows; bsh is the bit offset of mr.min.x in its byte */
2473         depth = mask->depth;
2474         maskwid = mask->width*sizeof(ulong);
2475         rp = byteaddr(mask, mr.min);
2476         npack = 8/depth;
2477         bsh = (mr.min.x % npack) * depth;
2478
      /* wp walks the destination rows */
2479         wp = byteaddr(dst, r.min);
2480         dstwid = dst->width*sizeof(ulong);
2481 DBG print("bsh %d\n", bsh);
2482         dy = Dy(r);
2483         dx = Dx(r);
2484
2485         ddepth = dst->depth;
2486
2487         /*
2488          * for loop counts from bsh to bsh+dx
2489          *
2490          * we want the bottom bits to be the amount
2491          * to shift the pixels down, so for n≡0 (mod 8) we want
2492          * bottom bits 7.  for n≡1, 6, etc.
2493          * the bits come from -n-1.
2494          */
2495
2496         bx = -bsh-1;
2497         ex = -bsh-1-dx;
2498         SET(bits);
2499         v = par->sdval;
2500
2501         /* make little endian */
2502         sp[0] = v;
2503         sp[1] = v>>8;
2504         sp[2] = v>>16;
2505         sp[3] = v>>24;
2506
2507 //print("sp %x %x %x %x\n", sp[0], sp[1], sp[2], sp[3]);
2508         for(y=0; y<dy; y++, rp+=maskwid, wp+=dstwid){
2509                 q = rp;
              /*
               * The loops below fetch a mask byte only when x&7 is 7,
               * i.e. at the first bit of a byte; a row that starts in the
               * middle of a byte needs that byte loaded up front.
               */
2510                 if(bsh)
2511                         bits = *q++;
2512                 switch(ddepth){
2513                 case 8:
2514 //if(drawdebug) iprint("8loop...");
2515                         wc = wp;
2516                         for(x=bx; x>ex; x--, wc++){
                              /* i is the bit number in the mask byte, counting down from 7 */
2517                                 i = x&7;
2518                                 if(i == 8-1)
2519                                         bits = *q++;
2520 DBG print("bits %lux sh %d...", bits, i);
2521                                 if((bits>>i)&1)
2522                                         *wc = v;
2523                         }
2524                         break;
2525                 case 16:
2526                         ws = (ushort*)wp;
                      /* reload v as the little-endian 16-bit image of sdval */
2527                         v = *(ushort*)sp;
2528                         for(x=bx; x>ex; x--, ws++){
2529                                 i = x&7;
2530                                 if(i == 8-1)
2531                                         bits = *q++;
2532 DBG print("bits %lux sh %d...", bits, i);
2533                                 if((bits>>i)&1)
2534                                         *ws = v;
2535                         }
2536                         break;
2537                 case 24:
2538                         wc = wp;
2539                         for(x=bx; x>ex; x--, wc+=3){
2540                                 i = x&7;
2541                                 if(i == 8-1)
2542                                         bits = *q++;
2543 DBG print("bits %lux sh %d...", bits, i);
2544                                 if((bits>>i)&1){
2545                                         wc[0] = sp[0];
2546                                         wc[1] = sp[1];
2547                                         wc[2] = sp[2];
2548                                 }
2549                         }
2550                         break;
2551                 case 32:
2552                         wl = (ulong*)wp;
                      /* reload v as the little-endian word image of sdval */
2553                         v = *(ulong*)sp;
2554                         for(x=bx; x>ex; x--, wl++){
2555                                 i = x&7;
2556                                 if(i == 8-1)
2557                                         bits = *q++;
2558 DBG iprint("bits %lux sh %d...", bits, i);
2559                                 if((bits>>i)&1)
2560                                         *wl = v;
2561                         }
2562                         break;
2563                 }
2564         }
2565
2566 DBG print("\n");
2567         return 1;
2568 }
2569 #undef DBG
2570
2571
2572 /*
2573  * Fill entire byte with replicated (if necessary) copy of source pixel,
2574  * assuming destination ldepth is >= source ldepth.
2575  *
2576  * This code is just plain wrong for >8bpp.
2577  *
2578 ulong
2579 membyteval(Memimage *src)
2580 {
2581         int i, val, bpp;
2582         uchar uc;
2583
2584         unloadmemimage(src, src->r, &uc, 1);
2585         bpp = src->depth;
2586         uc <<= (src->r.min.x&(7/src->depth))*src->depth;
2587         uc &= ~(0xFF>>bpp);
2588         /* pixel value is now in high part of byte. repeat throughout byte
2589         val = uc;
2590         for(i=bpp; i<8; i<<=1)
2591                 val |= val>>i;
2592         return val;
2593 }
2594  *
2595  */
2596
2597 void
2598 memfillcolor(Memimage *i, ulong val)
2599 {
2600         ulong bits;
2601         int d, y;
2602
2603         if(val == DNofill)
2604                 return;
2605
2606         bits = rgbatoimg(i, val);
2607         switch(i->depth){
2608         case 24:        /* 24-bit images suck */
2609                 for(y=i->r.min.y; y<i->r.max.y; y++)
2610                         memset24(byteaddr(i, Pt(i->r.min.x, y)), bits, Dx(i->r));
2611                 break;
2612         default:        /* 1, 2, 4, 8, 16, 32 */
2613                 for(d=i->depth; d<32; d*=2)
2614                         bits = (bits << d) | bits;
2615                 memsetl(wordaddr(i, i->r.min), bits, i->width*Dy(i->r));
2616                 break;
2617         }
2618 }
2619