root/ext/ereg/regex/regcomp.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. API_EXPORT
  2. p_ere
  3. p_ere_exp
  4. p_str
  5. p_bre
  6. p_simp_re
  7. p_count
  8. p_bracket
  9. p_b_term
  10. p_b_cclass
  11. p_b_eclass
  12. p_b_symbol
  13. p_b_coll_elem
  14. othercase
  15. bothcases
  16. ordinary
  17. nonnewline
  18. repeat
  19. seterr
  20. allocset
  21. freeset
  22. freezeset
  23. firstch
  24. nch
  25. mcadd
  26. mcsub
  27. mcin
  28. mcfind
  29. mcinvert
  30. mccase
  31. isinsets
  32. samesets
  33. categorize
  34. dupl
  35. doemit
  36. doinsert
  37. dofwd
  38. enlarge
  39. stripsnug
  40. findmust
  41. pluscount

   1 #include <sys/types.h>
   2 #include <stdio.h>
   3 #include <string.h>
   4 #include <ctype.h>
   5 #include <limits.h>
   6 #include <stdlib.h>
   7 
   8 #define POSIX_MISTAKE
   9 
  10 #include "utils.h"
  11 #include "regex.h"
  12 #include "regex2.h"
  13 
  14 #include "cclass.h"
  15 #include "cname.h"
  16 
  17 /*
  18  * parse structure, passed up and down to avoid global variables and
  19  * other clumsinesses
  20  */
  21 struct parse {
  22         unsigned char *next;            /* next character in RE */
  23         unsigned char *end;             /* end of string (-> NUL normally) */
  24         int error;              /* has an error been seen? */
  25         sop *strip;             /* malloced strip */
  26         sopno ssize;            /* malloced strip size (allocated) */
  27         sopno slen;             /* malloced strip length (used) */
  28         int ncsalloc;           /* number of csets allocated */
  29         struct re_guts *g;
  30 #       define  NPAREN  10      /* we need to remember () 1-9 for back refs */
  31         sopno pbegin[NPAREN];   /* -> ( ([0] unused) */
  32         sopno pend[NPAREN];     /* -> ) ([0] unused) */
  33 };
  34 
  35 #include "regcomp.ih"
  36 
  37 static unsigned char nuls[10];          /* place to point scanner in event of error */
  38 
  39 /*
  40  * macros for use with parse structure
  41  * BEWARE:  these know that the parse structure is named `p' !!!
  42  */
  43 #define PEEK()  (*p->next)
  44 #define PEEK2() (*(p->next+1))
  45 #define MORE()  (p->next < p->end)
  46 #define MORE2() (p->next+1 < p->end)
  47 #define SEE(c)  (MORE() && PEEK() == (c))
  48 #define SEETWO(a, b)    (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
  49 #define EAT(c)  ((SEE(c)) ? (NEXT(), 1) : 0)
  50 #define EATTWO(a, b)    ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
  51 #define NEXT()  (p->next++)
  52 #define NEXT2() (p->next += 2)
  53 #define NEXTn(n)        (p->next += (n))
  54 #define GETNEXT()       (*p->next++)
  55 #define SETERROR(e)     seterr(p, (e))
  56 #define REQUIRE(co, e)  (void) ((co) || SETERROR(e))
  57 #define MUSTSEE(c, e)   (REQUIRE(MORE() && PEEK() == (c), e))
  58 #define MUSTEAT(c, e)   (REQUIRE(MORE() && GETNEXT() == (c), e))
  59 #define MUSTNOTSEE(c, e)        (REQUIRE(!MORE() || PEEK() != (c), e))
  60 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
  61 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
  62 #define AHEAD(pos)              dofwd(p, pos, HERE()-(pos))
  63 #define ASTERN(sop, pos)        EMIT(sop, HERE()-pos)
  64 #define HERE()          (p->slen)
  65 #define THERE()         (p->slen - 1)
  66 #define THERETHERE()    (p->slen - 2)
  67 #define DROP(n) (p->slen -= (n))
  68 
  69 #ifndef NDEBUG
  70 static int never = 0;           /* for use in asserts; shuts lint up */
  71 #else
  72 #define never   0               /* some <assert.h>s have bugs too */
  73 #endif
  74 
  75 /*
  76  - regcomp - interface for parser and compilation
  77  = API_EXPORT(int) regcomp(regex_t *, const char *, int);
  78  = #define      REG_BASIC       0000
  79  = #define      REG_EXTENDED    0001
  80  = #define      REG_ICASE       0002
  81  = #define      REG_NOSUB       0004
  82  = #define      REG_NEWLINE     0010
  83  = #define      REG_NOSPEC      0020
  84  = #define      REG_PEND        0040
  85  = #define      REG_DUMP        0200
  86  */
  87 API_EXPORT(int)                 /* 0 success, otherwise REG_something */
  88 regcomp(preg, pattern, cflags)
  89 regex_t *preg;
  90 const char *pattern;
  91 int cflags;
  92 {
  93         struct parse pa;
  94         register struct re_guts *g;
  95         register struct parse *p = &pa;
  96         register int i;
  97         register size_t len;
  98 #ifdef REDEBUG
  99 #       define  GOODFLAGS(f)    (f)
 100 #else
 101 #       define  GOODFLAGS(f)    ((f)&~REG_DUMP)
 102 #endif
 103 
 104         cflags = GOODFLAGS(cflags);
 105         if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
 106                 return(REG_INVARG);
 107 
 108         if (cflags&REG_PEND) {
 109                 if (preg->re_endp < pattern)
 110                         return(REG_INVARG);
 111                 len = preg->re_endp - pattern;
 112         } else
 113                 len = strlen((char *)pattern);
 114 
 115         /* do the mallocs early so failure handling is easy */
 116         g = (struct re_guts *)malloc(sizeof(struct re_guts) +
 117                                                         (NC-1)*sizeof(cat_t));
 118         if (g == NULL)
 119                 return(REG_ESPACE);
 120         {
 121                 /* Patched for CERT Vulnerability Note VU#695940, Feb 2015. */
 122                 size_t new_ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
 123                 if (new_ssize < len || new_ssize > LONG_MAX / sizeof(sop)) {
 124                         free((char *) g);
 125                         return REG_INVARG;
 126                 }
 127                 p->ssize = new_ssize;
 128         }
 129         p->strip = (sop *)malloc(p->ssize * sizeof(sop));
 130         p->slen = 0;
 131         if (p->strip == NULL) {
 132                 free((char *)g);
 133                 return(REG_ESPACE);
 134         }
 135 
 136         /* set things up */
 137         p->g = g;
 138         p->next = (unsigned char *)pattern;     /* convenience; we do not modify it */
 139         p->end = p->next + len;
 140         p->error = 0;
 141         p->ncsalloc = 0;
 142         for (i = 0; i < NPAREN; i++) {
 143                 p->pbegin[i] = 0;
 144                 p->pend[i] = 0;
 145         }
 146         g->csetsize = NC;
 147         g->sets = NULL;
 148         g->setbits = NULL;
 149         g->ncsets = 0;
 150         g->cflags = cflags;
 151         g->iflags = 0;
 152         g->nbol = 0;
 153         g->neol = 0;
 154         g->must = NULL;
 155         g->mlen = 0;
 156         g->nsub = 0;
 157         g->ncategories = 1;     /* category 0 is "everything else" */
 158         g->categories = &g->catspace[0];
 159         (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
 160         g->backrefs = 0;
 161 
 162         /* do it */
 163         EMIT(OEND, 0);
 164         g->firststate = THERE();
 165         if (cflags&REG_EXTENDED)
 166                 p_ere(p, OUT);
 167         else if (cflags&REG_NOSPEC)
 168                 p_str(p);
 169         else
 170                 p_bre(p, OUT, OUT);
 171         EMIT(OEND, 0);
 172         g->laststate = THERE();
 173 
 174         /* tidy up loose ends and fill things in */
 175         categorize(p, g);
 176         stripsnug(p, g);
 177         findmust(p, g);
 178         g->nplus = pluscount(p, g);
 179         g->magic = MAGIC2;
 180         preg->re_nsub = g->nsub;
 181         preg->re_g = g;
 182         preg->re_magic = MAGIC1;
 183 #ifndef REDEBUG
 184         /* not debugging, so can't rely on the assert() in regexec() */
 185         if (g->iflags&BAD)
 186                 SETERROR(REG_ASSERT);
 187 #endif
 188 
 189         /* win or lose, we're done */
 190         if (p->error != 0)      /* lose */
 191                 regfree(preg);
 192         return(p->error);
 193 }
 194 
 195 /*
 196  - p_ere - ERE parser top level, concatenation and alternation
 197  == static void p_ere(register struct parse *p, int stop);
 198  */
 199 static void
 200 p_ere(p, stop)
 201 register struct parse *p;
 202 int stop;                       /* character this ERE should end at */
 203 {
 204         register unsigned char c;
 205         register sopno prevback = 0;
 206         register sopno prevfwd = 0;
 207         register sopno conc;
 208         register int first = 1;         /* is this the first alternative? */
 209 
 210         for (;;) {
 211                 /* do a bunch of concatenated expressions */
 212                 conc = HERE();
 213                 while (MORE() && (c = PEEK()) != '|' && c != stop)
 214                         p_ere_exp(p);
 215                 (void) REQUIRE(HERE() != conc, REG_EMPTY);      /* require nonempty */
 216 
 217                 if (!EAT('|'))
 218                         break;          /* NOTE BREAK OUT */
 219 
 220                 if (first) {
 221                         INSERT(OCH_, conc);     /* offset is wrong */
 222                         prevfwd = conc;
 223                         prevback = conc;
 224                         first = 0;
 225                 }
 226                 ASTERN(OOR1, prevback);
 227                 prevback = THERE();
 228                 AHEAD(prevfwd);                 /* fix previous offset */
 229                 prevfwd = HERE();
 230                 EMIT(OOR2, 0);                  /* offset is very wrong */
 231         }
 232 
 233         if (!first) {           /* tail-end fixups */
 234                 AHEAD(prevfwd);
 235                 ASTERN(O_CH, prevback);
 236         }
 237 
 238         assert(!MORE() || SEE(stop));
 239 }
 240 
 241 /*
 242  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
 243  == static void p_ere_exp(register struct parse *p);
 244  */
 245 static void
 246 p_ere_exp(p)
 247 register struct parse *p;
 248 {
 249         register unsigned char c;
 250         register sopno pos;
 251         register int count;
 252         register int count2;
 253         register sopno subno;
 254         int wascaret = 0;
 255 
 256         assert(MORE());         /* caller should have ensured this */
 257         c = GETNEXT();
 258 
 259         pos = HERE();
 260         switch (c) {
 261         case '(':
 262                 REQUIRE(MORE(), REG_EPAREN);
 263                 p->g->nsub++;
 264                 subno = p->g->nsub;
 265                 if (subno < NPAREN)
 266                         p->pbegin[subno] = HERE();
 267                 EMIT(OLPAREN, subno);
 268                 if (!SEE(')'))
 269                         p_ere(p, ')');
 270                 if (subno < NPAREN) {
 271                         p->pend[subno] = HERE();
 272                         assert(p->pend[subno] != 0);
 273                 }
 274                 EMIT(ORPAREN, subno);
 275                 MUSTEAT(')', REG_EPAREN);
 276                 break;
 277 #ifndef POSIX_MISTAKE
 278         case ')':               /* happens only if no current unmatched ( */
 279                 /*
 280                  * You may ask, why the ifndef?  Because I didn't notice
 281                  * this until slightly too late for 1003.2, and none of the
 282                  * other 1003.2 regular-expression reviewers noticed it at
 283                  * all.  So an unmatched ) is legal POSIX, at least until
 284                  * we can get it fixed.
 285                  */
 286                 SETERROR(REG_EPAREN);
 287                 break;
 288 #endif
 289         case '^':
 290                 EMIT(OBOL, 0);
 291                 p->g->iflags |= USEBOL;
 292                 p->g->nbol++;
 293                 wascaret = 1;
 294                 break;
 295         case '$':
 296                 EMIT(OEOL, 0);
 297                 p->g->iflags |= USEEOL;
 298                 p->g->neol++;
 299                 break;
 300         case '|':
 301                 SETERROR(REG_EMPTY);
 302                 break;
 303         case '*':
 304         case '+':
 305         case '?':
 306                 SETERROR(REG_BADRPT);
 307                 break;
 308         case '.':
 309                 if (p->g->cflags&REG_NEWLINE)
 310                         nonnewline(p);
 311                 else
 312                         EMIT(OANY, 0);
 313                 break;
 314         case '[':
 315                 p_bracket(p);
 316                 break;
 317         case '\\':
 318                 REQUIRE(MORE(), REG_EESCAPE);
 319                 c = GETNEXT();
 320                 ordinary(p, c);
 321                 break;
 322         case '{':               /* okay as ordinary except if digit follows */
 323                 REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
 324                 /* FALLTHROUGH */
 325         default:
 326                 ordinary(p, c);
 327                 break;
 328         }
 329 
 330         if (!MORE())
 331                 return;
 332         c = PEEK();
 333         /* we call { a repetition if followed by a digit */
 334         if (!( c == '*' || c == '+' || c == '?' ||
 335                                 (c == '{' && MORE2() && isdigit(PEEK2())) ))
 336                 return;         /* no repetition, we're done */
 337         NEXT();
 338 
 339         REQUIRE(!wascaret, REG_BADRPT);
 340         switch (c) {
 341         case '*':       /* implemented as +? */
 342                 /* this case does not require the (y|) trick, noKLUDGE */
 343                 INSERT(OPLUS_, pos);
 344                 ASTERN(O_PLUS, pos);
 345                 INSERT(OQUEST_, pos);
 346                 ASTERN(O_QUEST, pos);
 347                 break;
 348         case '+':
 349                 INSERT(OPLUS_, pos);
 350                 ASTERN(O_PLUS, pos);
 351                 break;
 352         case '?':
 353                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 354                 INSERT(OCH_, pos);              /* offset slightly wrong */
 355                 ASTERN(OOR1, pos);              /* this one's right */
 356                 AHEAD(pos);                     /* fix the OCH_ */
 357                 EMIT(OOR2, 0);                  /* offset very wrong... */
 358                 AHEAD(THERE());                 /* ...so fix it */
 359                 ASTERN(O_CH, THERETHERE());
 360                 break;
 361         case '{':
 362                 count = p_count(p);
 363                 if (EAT(',')) {
 364                         if (isdigit(PEEK())) {
 365                                 count2 = p_count(p);
 366                                 REQUIRE(count <= count2, REG_BADBR);
 367                         } else          /* single number with comma */
 368                                 count2 = INFINITY;
 369                 } else          /* just a single number */
 370                         count2 = count;
 371                 repeat(p, pos, count, count2);
 372                 if (!EAT('}')) {        /* error heuristics */
 373                         while (MORE() && PEEK() != '}')
 374                                 NEXT();
 375                         REQUIRE(MORE(), REG_EBRACE);
 376                         SETERROR(REG_BADBR);
 377                 }
 378                 break;
 379         }
 380 
 381         if (!MORE())
 382                 return;
 383         c = PEEK();
 384         if (!( c == '*' || c == '+' || c == '?' ||
 385                                 (c == '{' && MORE2() && isdigit(PEEK2())) ) )
 386                 return;
 387         SETERROR(REG_BADRPT);
 388 }
 389 
 390 /*
 391  - p_str - string (no metacharacters) "parser"
 392  == static void p_str(register struct parse *p);
 393  */
 394 static void
 395 p_str(p)
 396 register struct parse *p;
 397 {
 398         REQUIRE(MORE(), REG_EMPTY);
 399         while (MORE())
 400                 ordinary(p, GETNEXT());
 401 }
 402 
 403 /*
 404  - p_bre - BRE parser top level, anchoring and concatenation
 405  == static void p_bre(register struct parse *p, register int end1, \
 406  ==     register int end2);
 407  * Giving end1 as OUT essentially eliminates the end1/end2 check.
 408  *
 409  * This implementation is a bit of a kludge, in that a trailing $ is first
 410  * taken as an ordinary character and then revised to be an anchor.  The
 411  * only undesirable side effect is that '$' gets included as a character
 412  * category in such cases.  This is fairly harmless; not worth fixing.
 413  * The amount of lookahead needed to avoid this kludge is excessive.
 414  */
 415 static void
 416 p_bre(p, end1, end2)
 417 register struct parse *p;
 418 register int end1;              /* first terminating character */
 419 register int end2;              /* second terminating character */
 420 {
 421         register sopno start = HERE();
 422         register int first = 1;                 /* first subexpression? */
 423         register int wasdollar = 0;
 424 
 425         if (EAT('^')) {
 426                 EMIT(OBOL, 0);
 427                 p->g->iflags |= USEBOL;
 428                 p->g->nbol++;
 429         }
 430         while (MORE() && !SEETWO(end1, end2)) {
 431                 wasdollar = p_simp_re(p, first);
 432                 first = 0;
 433         }
 434         if (wasdollar) {        /* oops, that was a trailing anchor */
 435                 DROP(1);
 436                 EMIT(OEOL, 0);
 437                 p->g->iflags |= USEEOL;
 438                 p->g->neol++;
 439         }
 440 
 441         REQUIRE(HERE() != start, REG_EMPTY);    /* require nonempty */
 442 }
 443 
 444 /*
 445  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
 446  == static int p_simp_re(register struct parse *p, int starordinary);
 447  */
 448 static int                      /* was the simple RE an unbackslashed $? */
 449 p_simp_re(p, starordinary)
 450 register struct parse *p;
 451 int starordinary;               /* is a leading * an ordinary character? */
 452 {
 453         register int c;
 454         register int count;
 455         register int count2;
 456         register sopno pos;
 457         register int i;
 458         register sopno subno;
 459 #       define  BACKSL  (1<<CHAR_BIT)
 460 
 461         pos = HERE();           /* repetion op, if any, covers from here */
 462 
 463         assert(MORE());         /* caller should have ensured this */
 464         c = GETNEXT();
 465         if (c == '\\') {
 466                 REQUIRE(MORE(), REG_EESCAPE);
 467                 c = BACKSL | (unsigned char)GETNEXT();
 468         }
 469         switch (c) {
 470         case '.':
 471                 if (p->g->cflags&REG_NEWLINE)
 472                         nonnewline(p);
 473                 else
 474                         EMIT(OANY, 0);
 475                 break;
 476         case '[':
 477                 p_bracket(p);
 478                 break;
 479         case BACKSL|'{':
 480                 SETERROR(REG_BADRPT);
 481                 break;
 482         case BACKSL|'(':
 483                 p->g->nsub++;
 484                 subno = p->g->nsub;
 485                 if (subno < NPAREN)
 486                         p->pbegin[subno] = HERE();
 487                 EMIT(OLPAREN, subno);
 488                 /* the MORE here is an error heuristic */
 489                 if (MORE() && !SEETWO('\\', ')'))
 490                         p_bre(p, '\\', ')');
 491                 if (subno < NPAREN) {
 492                         p->pend[subno] = HERE();
 493                         assert(p->pend[subno] != 0);
 494                 }
 495                 EMIT(ORPAREN, subno);
 496                 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
 497                 break;
 498         case BACKSL|')':        /* should not get here -- must be user */
 499         case BACKSL|'}':
 500                 SETERROR(REG_EPAREN);
 501                 break;
 502         case BACKSL|'1':
 503         case BACKSL|'2':
 504         case BACKSL|'3':
 505         case BACKSL|'4':
 506         case BACKSL|'5':
 507         case BACKSL|'6':
 508         case BACKSL|'7':
 509         case BACKSL|'8':
 510         case BACKSL|'9':
 511                 i = (c&~BACKSL) - '0';
 512                 assert(i < NPAREN);
 513                 if (p->pend[i] != 0) {
 514                         assert(i <= p->g->nsub);
 515                         EMIT(OBACK_, i);
 516                         assert(p->pbegin[i] != 0);
 517                         assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
 518                         assert(OP(p->strip[p->pend[i]]) == ORPAREN);
 519                         (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
 520                         EMIT(O_BACK, i);
 521                 } else
 522                         SETERROR(REG_ESUBREG);
 523                 p->g->backrefs = 1;
 524                 break;
 525         case '*':
 526                 REQUIRE(starordinary, REG_BADRPT);
 527                 /* FALLTHROUGH */
 528         default:
 529                 ordinary(p, (unsigned char)c);  /* takes off BACKSL, if any */
 530                 break;
 531         }
 532 
 533         if (EAT('*')) {         /* implemented as +? */
 534                 /* this case does not require the (y|) trick, noKLUDGE */
 535                 INSERT(OPLUS_, pos);
 536                 ASTERN(O_PLUS, pos);
 537                 INSERT(OQUEST_, pos);
 538                 ASTERN(O_QUEST, pos);
 539         } else if (EATTWO('\\', '{')) {
 540                 count = p_count(p);
 541                 if (EAT(',')) {
 542                         if (MORE() && isdigit(PEEK())) {
 543                                 count2 = p_count(p);
 544                                 REQUIRE(count <= count2, REG_BADBR);
 545                         } else          /* single number with comma */
 546                                 count2 = INFINITY;
 547                 } else          /* just a single number */
 548                         count2 = count;
 549                 repeat(p, pos, count, count2);
 550                 if (!EATTWO('\\', '}')) {       /* error heuristics */
 551                         while (MORE() && !SEETWO('\\', '}'))
 552                                 NEXT();
 553                         REQUIRE(MORE(), REG_EBRACE);
 554                         SETERROR(REG_BADBR);
 555                 }
 556         } else if (c == (unsigned char)'$')     /* $ (but not \$) ends it */
 557                 return(1);
 558 
 559         return(0);
 560 }
 561 
 562 /*
 563  - p_count - parse a repetition count
 564  == static int p_count(register struct parse *p);
 565  */
 566 static int                      /* the value */
 567 p_count(p)
 568 register struct parse *p;
 569 {
 570         register int count = 0;
 571         register int ndigits = 0;
 572 
 573         while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
 574                 count = count*10 + (GETNEXT() - '0');
 575                 ndigits++;
 576         }
 577 
 578         REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
 579         return(count);
 580 }
 581 
 582 /*
 583  - p_bracket - parse a bracketed character list
 584  == static void p_bracket(register struct parse *p);
 585  *
 586  * Note a significant property of this code:  if the allocset() did SETERROR,
 587  * no set operations are done.
 588  */
 589 static void
 590 p_bracket(p)
 591 register struct parse *p;
 592 {
 593         register cset *cs = allocset(p);
 594         register int invert = 0;
 595 
 596         /* Dept of Truly Sickening Special-Case Kludges */
 597         if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
 598                 EMIT(OBOW, 0);
 599                 NEXTn(6);
 600                 return;
 601         }
 602         if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
 603                 EMIT(OEOW, 0);
 604                 NEXTn(6);
 605                 return;
 606         }
 607 
 608         if (EAT('^'))
 609                 invert++;       /* make note to invert set at end */
 610         if (EAT(']'))
 611                 CHadd(cs, ']');
 612         else if (EAT('-'))
 613                 CHadd(cs, '-');
 614         while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
 615                 p_b_term(p, cs);
 616         if (EAT('-'))
 617                 CHadd(cs, '-');
 618         MUSTEAT(']', REG_EBRACK);
 619 
 620         if (p->error != 0)      /* don't mess things up further */
 621                 return;
 622 
 623         if (p->g->cflags&REG_ICASE) {
 624                 register int i;
 625                 register int ci;
 626 
 627                 for (i = p->g->csetsize - 1; i >= 0; i--)
 628                         if (CHIN(cs, i) && isalpha(i)) {
 629                                 ci = othercase(i);
 630                                 if (ci != i)
 631                                         CHadd(cs, ci);
 632                         }
 633                 if (cs->multis != NULL)
 634                         mccase(p, cs);
 635         }
 636         if (invert) {
 637                 register int i;
 638 
 639                 for (i = p->g->csetsize - 1; i >= 0; i--)
 640                         if (CHIN(cs, i))
 641                                 CHsub(cs, i);
 642                         else
 643                                 CHadd(cs, i);
 644                 if (p->g->cflags&REG_NEWLINE)
 645                         CHsub(cs, '\n');
 646                 if (cs->multis != NULL)
 647                         mcinvert(p, cs);
 648         }
 649 
 650         assert(cs->multis == NULL);             /* xxx */
 651 
 652         if (nch(p, cs) == 1) {          /* optimize singleton sets */
 653                 ordinary(p, firstch(p, cs));
 654                 freeset(p, cs);
 655         } else
 656                 EMIT(OANYOF, freezeset(p, cs));
 657 }
 658 
 659 /*
 660  - p_b_term - parse one term of a bracketed character list
 661  == static void p_b_term(register struct parse *p, register cset *cs);
 662  */
 663 static void
 664 p_b_term(p, cs)
 665 register struct parse *p;
 666 register cset *cs;
 667 {
 668         register unsigned char c;
 669         register unsigned char start, finish;
 670         register int i;
 671 
 672         /* classify what we've got */
 673         switch ((MORE()) ? PEEK() : '\0') {
 674         case '[':
 675                 c = (MORE2()) ? PEEK2() : '\0';
 676                 break;
 677         case '-':
 678                 SETERROR(REG_ERANGE);
 679                 return;                 /* NOTE RETURN */
 680                 break;
 681         default:
 682                 c = '\0';
 683                 break;
 684         }
 685 
 686         switch (c) {
 687         case ':':               /* character class */
 688                 NEXT2();
 689                 REQUIRE(MORE(), REG_EBRACK);
 690                 c = PEEK();
 691                 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
 692                 p_b_cclass(p, cs);
 693                 REQUIRE(MORE(), REG_EBRACK);
 694                 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
 695                 break;
 696         case '=':               /* equivalence class */
 697                 NEXT2();
 698                 REQUIRE(MORE(), REG_EBRACK);
 699                 c = PEEK();
 700                 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
 701                 p_b_eclass(p, cs);
 702                 REQUIRE(MORE(), REG_EBRACK);
 703                 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
 704                 break;
 705         default:                /* symbol, ordinary character, or range */
 706 /* xxx revision needed for multichar stuff */
 707                 start = p_b_symbol(p);
 708                 if (SEE('-') && MORE2() && PEEK2() != ']') {
 709                         /* range */
 710                         NEXT();
 711                         if (EAT('-'))
 712                                 finish = '-';
 713                         else
 714                                 finish = p_b_symbol(p);
 715                 } else
 716                         finish = start;
 717 /* xxx what about signed chars here... */
 718                 REQUIRE(start <= finish, REG_ERANGE);
 719                 for (i = start; i <= finish; i++)
 720                         CHadd(cs, i);
 721                 break;
 722         }
 723 }
 724 
 725 /*
 726  - p_b_cclass - parse a character-class name and deal with it
 727  == static void p_b_cclass(register struct parse *p, register cset *cs);
 728  */
 729 static void
 730 p_b_cclass(p, cs)
 731 register struct parse *p;
 732 register cset *cs;
 733 {
 734         register unsigned char *sp = p->next;
 735         register const struct cclass *cp;
 736         register size_t len;
 737         register const unsigned char *u;
 738         register unsigned char c;
 739 
 740         while (MORE() && isalpha(PEEK()))
 741                 NEXT();
 742         len = p->next - sp;
 743         for (cp = cclasses; cp->name != NULL; cp++)
 744                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 745                         break;
 746         if (cp->name == NULL) {
 747                 /* oops, didn't find it */
 748                 SETERROR(REG_ECTYPE);
 749                 return;
 750         }
 751 
 752         u = cp->chars;
 753         while ((c = *u++) != '\0')
 754                 CHadd(cs, c);
 755         for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
 756                 MCadd(p, cs, u);
 757 }
 758 
 759 /*
 760  - p_b_eclass - parse an equivalence-class name and deal with it
 761  == static void p_b_eclass(register struct parse *p, register cset *cs);
 762  *
 763  * This implementation is incomplete. xxx
 764  */
 765 static void
 766 p_b_eclass(p, cs)
 767 register struct parse *p;
 768 register cset *cs;
 769 {
 770         register unsigned char c;
 771 
 772         c = p_b_coll_elem(p, '=');
 773         CHadd(cs, c);
 774 }
 775 
 776 /*
 777  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
 778  == static char p_b_symbol(register struct parse *p);
 779  */
 780 static unsigned char                    /* value of symbol */
 781 p_b_symbol(p)
 782 register struct parse *p;
 783 {
 784         register unsigned char value;
 785 
 786         REQUIRE(MORE(), REG_EBRACK);
 787         if (!EATTWO('[', '.'))
 788                 return(GETNEXT());
 789 
 790         /* collating symbol */
 791         value = p_b_coll_elem(p, '.');
 792         REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
 793         return(value);
 794 }
 795 
 796 /*
 797  - p_b_coll_elem - parse a collating-element name and look it up
 798  == static char p_b_coll_elem(register struct parse *p, int endc);
 799  */
 800 static unsigned char                    /* value of collating element */
 801 p_b_coll_elem(p, endc)
 802 register struct parse *p;
 803 int endc;                       /* name ended by endc,']' */
 804 {
 805         register unsigned char *sp = p->next;
 806         register const struct cname *cp;
 807         register int len;
 808 
 809         while (MORE() && !SEETWO(endc, ']'))
 810                 NEXT();
 811         if (!MORE()) {
 812                 SETERROR(REG_EBRACK);
 813                 return(0);
 814         }
 815         len = p->next - sp;
 816         for (cp = cnames; cp->name != NULL; cp++)
 817                 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
 818                         return(cp->code);       /* known name */
 819         if (len == 1)
 820                 return(*sp);    /* single character */
 821         SETERROR(REG_ECOLLATE);                 /* neither */
 822         return(0);
 823 }
 824 
 825 /*
 826  - othercase - return the case counterpart of an alphabetic
 827  == static char othercase(int ch);
 828  */
 829 static unsigned char                    /* if no counterpart, return ch */
 830 othercase(ch)
 831 int ch;
 832 {
 833         assert(isalpha(ch));
 834         if (isupper(ch))
 835                 return(tolower(ch));
 836         else if (islower(ch))
 837                 return(toupper(ch));
 838         else                    /* peculiar, but could happen */
 839                 return(ch);
 840 }
 841 
 842 /*
 843  - bothcases - emit a dualcase version of a two-case character
 844  == static void bothcases(register struct parse *p, int ch);
 845  *
 846  * Boy, is this implementation ever a kludge...
 847  */
 848 static void
 849 bothcases(p, ch)
 850 register struct parse *p;
 851 int ch;
 852 {
 853         register unsigned char *oldnext = p->next;
 854         register unsigned char *oldend = p->end;
 855         unsigned char bracket[3];
 856 
 857         assert(othercase(ch) != ch);    /* p_bracket() would recurse */
 858         p->next = bracket;
 859         p->end = bracket+2;
 860         bracket[0] = ch;
 861         bracket[1] = ']';
 862         bracket[2] = '\0';
 863         p_bracket(p);
 864         assert(p->next == bracket+2);
 865         p->next = oldnext;
 866         p->end = oldend;
 867 }
 868 
 869 /*
 870  - ordinary - emit an ordinary character
 871  == static void ordinary(register struct parse *p, register int ch);
 872  */
 873 static void
 874 ordinary(p, ch)
 875 register struct parse *p;
 876 register int ch;
 877 {
 878         register cat_t *cap = p->g->categories;
 879 
 880         if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
 881                 bothcases(p, ch);
 882         else {
 883                 EMIT(OCHAR, (unsigned char)ch);
 884                 if (cap[ch] == 0)
 885                         cap[ch] = p->g->ncategories++;
 886         }
 887 }
 888 
 889 /*
 890  - nonnewline - emit REG_NEWLINE version of OANY
 891  == static void nonnewline(register struct parse *p);
 892  *
 893  * Boy, is this implementation ever a kludge...
 894  */
 895 static void
 896 nonnewline(p)
 897 register struct parse *p;
 898 {
 899         register unsigned char *oldnext = p->next;
 900         register unsigned char *oldend = p->end;
 901         unsigned char bracket[4];
 902 
 903         p->next = bracket;
 904         p->end = bracket+3;
 905         bracket[0] = '^';
 906         bracket[1] = '\n';
 907         bracket[2] = ']';
 908         bracket[3] = '\0';
 909         p_bracket(p);
 910         assert(p->next == bracket+3);
 911         p->next = oldnext;
 912         p->end = oldend;
 913 }
 914 
 915 /*
 916  - repeat - generate code for a bounded repetition, recursively if needed
 917  == static void repeat(register struct parse *p, sopno start, int from, int to);
 918  */
 919 static void
 920 repeat(p, start, from, to)
 921 register struct parse *p;
 922 sopno start;                    /* operand from here to end of strip */
 923 int from;                       /* repeated from this number */
 924 int to;                         /* to this number of times (maybe INFINITY) */
 925 {
 926         register sopno finish = HERE();
 927 #       define  N       2
 928 #       define  INF     3
 929 #       define  REP(f, t)       ((f)*8 + (t))
 930 #       define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
 931         register sopno copy;
 932 
 933         if (p->error != 0)      /* head off possible runaway recursion */
 934                 return;
 935 
 936         assert(from <= to);
 937 
 938         switch (REP(MAP(from), MAP(to))) {
 939         case REP(0, 0):                 /* must be user doing this */
 940                 DROP(finish-start);     /* drop the operand */
 941                 break;
 942         case REP(0, 1):                 /* as x{1,1}? */
 943         case REP(0, N):                 /* as x{1,n}? */
 944         case REP(0, INF):               /* as x{1,}? */
 945                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 946                 INSERT(OCH_, start);            /* offset is wrong... */
 947                 repeat(p, start+1, 1, to);
 948                 ASTERN(OOR1, start);
 949                 AHEAD(start);                   /* ... fix it */
 950                 EMIT(OOR2, 0);
 951                 AHEAD(THERE());
 952                 ASTERN(O_CH, THERETHERE());
 953                 break;
 954         case REP(1, 1):                 /* trivial case */
 955                 /* done */
 956                 break;
 957         case REP(1, N):                 /* as x?x{1,n-1} */
 958                 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
 959                 INSERT(OCH_, start);
 960                 ASTERN(OOR1, start);
 961                 AHEAD(start);
 962                 EMIT(OOR2, 0);                  /* offset very wrong... */
 963                 AHEAD(THERE());                 /* ...so fix it */
 964                 ASTERN(O_CH, THERETHERE());
 965                 copy = dupl(p, start+1, finish+1);
 966                 assert(copy == finish+4);
 967                 repeat(p, copy, 1, to-1);
 968                 break;
 969         case REP(1, INF):               /* as x+ */
 970                 INSERT(OPLUS_, start);
 971                 ASTERN(O_PLUS, start);
 972                 break;
 973         case REP(N, N):                 /* as xx{m-1,n-1} */
 974                 copy = dupl(p, start, finish);
 975                 repeat(p, copy, from-1, to-1);
 976                 break;
 977         case REP(N, INF):               /* as xx{n-1,INF} */
 978                 copy = dupl(p, start, finish);
 979                 repeat(p, copy, from-1, to);
 980                 break;
 981         default:                        /* "can't happen" */
 982                 SETERROR(REG_ASSERT);   /* just in case */
 983                 break;
 984         }
 985 }
 986 
 987 /*
 988  - seterr - set an error condition
 989  == static int seterr(register struct parse *p, int e);
 990  */
 991 static int                      /* useless but makes type checking happy */
 992 seterr(p, e)
 993 register struct parse *p;
 994 int e;
 995 {
 996         if (p->error == 0)      /* keep earliest error condition */
 997                 p->error = e;
 998         p->next = nuls;         /* try to bring things to a halt */
 999         p->end = nuls;
1000         return(0);              /* make the return value well-defined */
1001 }
1002 
1003 /*
1004  - allocset - allocate a set of characters for []
1005  == static cset *allocset(register struct parse *p);
1006  */
1007 static cset *
1008 allocset(p)
1009 register struct parse *p;
1010 {
1011         register int no = p->g->ncsets++;
1012         register size_t nc;
1013         register size_t nbytes;
1014         register cset *cs;
1015         register size_t css = (size_t)p->g->csetsize;
1016         register int i;
1017 
1018         if (no >= p->ncsalloc) {        /* need another column of space */
1019                 p->ncsalloc += CHAR_BIT;
1020                 nc = p->ncsalloc;
1021                 assert(nc % CHAR_BIT == 0);
1022                 nbytes = nc / CHAR_BIT * css;
1023                 if (p->g->sets == NULL)
1024                         p->g->sets = (cset *)malloc(nc * sizeof(cset));
1025                 else
1026                         p->g->sets = (cset *)realloc((unsigned char *)p->g->sets,
1027                                                         nc * sizeof(cset));
1028                 if (p->g->setbits == NULL)
1029                         p->g->setbits = (uch *)malloc(nbytes);
1030                 else {
1031                         p->g->setbits = (uch *)realloc((unsigned char *)p->g->setbits,
1032                                                                 nbytes);
1033                         /* xxx this isn't right if setbits is now NULL */
1034                         for (i = 0; i < no; i++)
1035                                 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1036                 }
1037                 if (p->g->sets != NULL && p->g->setbits != NULL)
1038                         (void) memset((unsigned char *)p->g->setbits + (nbytes - css),
1039                                                                 0, css);
1040                 else {
1041                         no = 0;
1042                         SETERROR(REG_ESPACE);
1043                         /* caller's responsibility not to do set ops */
1044                 }
1045         }
1046 
1047         assert(p->g->sets != NULL);     /* xxx */
1048         cs = &p->g->sets[no];
1049         cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1050         cs->mask = 1 << ((no) % CHAR_BIT);
1051         cs->hash = 0;
1052         cs->smultis = 0;
1053         cs->multis = NULL;
1054 
1055         return(cs);
1056 }
1057 
1058 /*
1059  - freeset - free a now-unused set
1060  == static void freeset(register struct parse *p, register cset *cs);
1061  */
1062 static void
1063 freeset(p, cs)
1064 register struct parse *p;
1065 register cset *cs;
1066 {
1067         register size_t i;
1068         register cset *top = &p->g->sets[p->g->ncsets];
1069         register size_t css = (size_t)p->g->csetsize;
1070 
1071         for (i = 0; i < css; i++)
1072                 CHsub(cs, i);
1073         if (cs == top-1)        /* recover only the easy case */
1074                 p->g->ncsets--;
1075 }
1076 
1077 /*
1078  - freezeset - final processing on a set of characters
1079  == static int freezeset(register struct parse *p, register cset *cs);
1080  *
1081  * The main task here is merging identical sets.  This is usually a waste
1082  * of time (although the hash code minimizes the overhead), but can win
1083  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1084  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1085  * the same value!
1086  */
1087 static int                      /* set number */
1088 freezeset(p, cs)
1089 register struct parse *p;
1090 register cset *cs;
1091 {
1092         register uch h = cs->hash;
1093         register size_t i;
1094         register cset *top = &p->g->sets[p->g->ncsets];
1095         register cset *cs2;
1096         register size_t css = (size_t)p->g->csetsize;
1097 
1098         /* look for an earlier one which is the same */
1099         for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1100                 if (cs2->hash == h && cs2 != cs) {
1101                         /* maybe */
1102                         for (i = 0; i < css; i++)
1103                                 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1104                                         break;          /* no */
1105                         if (i == css)
1106                                 break;                  /* yes */
1107                 }
1108 
1109         if (cs2 < top) {        /* found one */
1110                 freeset(p, cs);
1111                 cs = cs2;
1112         }
1113 
1114         return((int)(cs - p->g->sets));
1115 }
1116 
1117 /*
1118  - firstch - return first character in a set (which must have at least one)
1119  == static int firstch(register struct parse *p, register cset *cs);
1120  */
1121 static int                      /* character; there is no "none" value */
1122 firstch(p, cs)
1123 register struct parse *p;
1124 register cset *cs;
1125 {
1126         register size_t i;
1127         register size_t css = (size_t)p->g->csetsize;
1128 
1129         for (i = 0; i < css; i++)
1130                 if (CHIN(cs, i))
1131                         return((unsigned char)i);
1132         assert(never);
1133         return(0);              /* arbitrary */
1134 }
1135 
1136 /*
1137  - nch - number of characters in a set
1138  == static int nch(register struct parse *p, register cset *cs);
1139  */
1140 static int
1141 nch(p, cs)
1142 register struct parse *p;
1143 register cset *cs;
1144 {
1145         register size_t i;
1146         register size_t css = (size_t)p->g->csetsize;
1147         register int n = 0;
1148 
1149         for (i = 0; i < css; i++)
1150                 if (CHIN(cs, i))
1151                         n++;
1152         return(n);
1153 }
1154 
1155 /*
1156  - mcadd - add a collating element to a cset
1157  == static void mcadd(register struct parse *p, register cset *cs, \
1158  ==     register char *cp);
1159  */
1160 static void
1161 mcadd(p, cs, cp)
1162 register struct parse *p;
1163 register cset *cs;
1164 register const unsigned char *cp;
1165 {
1166         register size_t oldend = cs->smultis;
1167 
1168         cs->smultis += strlen(cp) + 1;
1169         if (cs->multis == NULL)
1170                 cs->multis = malloc(cs->smultis);
1171         else
1172                 cs->multis = realloc(cs->multis, cs->smultis);
1173         if (cs->multis == NULL) {
1174                 SETERROR(REG_ESPACE);
1175                 return;
1176         }
1177 
1178         (void) strcpy(cs->multis + oldend - 1, cp);
1179         cs->multis[cs->smultis - 1] = '\0';
1180 }
1181 
1182 #if 0
1183 /*
1184  - mcsub - subtract a collating element from a cset
1185  == static void mcsub(register cset *cs, register unsigned char *cp);
1186  */
1187 static void
1188 mcsub(cs, cp)
1189 register unsigned cset *cs;
1190 register unsigned char *cp;
1191 {
1192         register unsigned char *fp = mcfind(cs, cp);
1193         register size_t len = strlen(fp);
1194 
1195         assert(fp != NULL);
1196         (void) memmove(fp, fp + len + 1,
1197                                 cs->smultis - (fp + len + 1 - cs->multis));
1198         cs->smultis -= len;
1199 
1200         if (cs->smultis == 0) {
1201                 free(cs->multis);
1202                 cs->multis = NULL;
1203                 return;
1204         }
1205 
1206         cs->multis = realloc(cs->multis, cs->smultis);
1207         assert(cs->multis != NULL);
1208 }
1209 
1210 /*
1211  - mcin - is a collating element in a cset?
1212  == static int mcin(register cset *cs, register unsigned char *cp);
1213  */
1214 static int
1215 mcin(cs, cp)
1216 register cset *cs;
1217 register unsigned char *cp;
1218 {
1219         return(mcfind(cs, cp) != NULL);
1220 }
1221 
1222 
1223 /*
1224  - mcfind - find a collating element in a cset
1225  == static unsigned char *mcfind(register cset *cs, register unsigned char *cp);
1226  */
1227 static unsigned char *
1228 mcfind(cs, cp)
1229 register cset *cs;
1230 register unsigned char *cp;
1231 {
1232         register unsigned char *p;
1233 
1234         if (cs->multis == NULL)
1235                 return(NULL);
1236         for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1237                 if (strcmp(cp, p) == 0)
1238                         return(p);
1239         return(NULL);
1240 }
1241 #endif
1242 
1243 /*
1244  - mcinvert - invert the list of collating elements in a cset
1245  == static void mcinvert(register struct parse *p, register cset *cs);
1246  *
1247  * This would have to know the set of possibilities.  Implementation
1248  * is deferred.
1249  */
1250 static void
1251 mcinvert(p, cs)
1252 register struct parse *p;
1253 register cset *cs;
1254 {
1255         assert(cs->multis == NULL);     /* xxx */
1256 }
1257 
1258 /*
1259  - mccase - add case counterparts of the list of collating elements in a cset
1260  == static void mccase(register struct parse *p, register cset *cs);
1261  *
1262  * This would have to know the set of possibilities.  Implementation
1263  * is deferred.
1264  */
1265 static void
1266 mccase(p, cs)
1267 register struct parse *p;
1268 register cset *cs;
1269 {
1270         assert(cs->multis == NULL);     /* xxx */
1271 }
1272 
1273 /*
1274  - isinsets - is this character in any sets?
1275  == static int isinsets(register struct re_guts *g, int c);
1276  */
1277 static int                      /* predicate */
1278 isinsets(g, c)
1279 register struct re_guts *g;
1280 int c;
1281 {
1282         register uch *col;
1283         register int i;
1284         register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1285         register unsigned uc = (unsigned char)c;
1286 
1287         if (!g->setbits) {
1288                 return(0);
1289         }
1290 
1291         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1292                 if (col[uc] != 0)
1293                         return(1);
1294         return(0);
1295 }
1296 
1297 /*
1298  - samesets - are these two characters in exactly the same sets?
1299  == static int samesets(register struct re_guts *g, int c1, int c2);
1300  */
1301 static int                      /* predicate */
1302 samesets(g, c1, c2)
1303 register struct re_guts *g;
1304 int c1;
1305 int c2;
1306 {
1307         register uch *col;
1308         register int i;
1309         register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1310         register unsigned uc1 = (unsigned char)c1;
1311         register unsigned uc2 = (unsigned char)c2;
1312 
1313         for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1314                 if (col[uc1] != col[uc2])
1315                         return(0);
1316         return(1);
1317 }
1318 
1319 /*
1320  - categorize - sort out character categories
1321  == static void categorize(struct parse *p, register struct re_guts *g);
1322  */
1323 static void
1324 categorize(p, g)
1325 struct parse *p;
1326 register struct re_guts *g;
1327 {
1328         register cat_t *cats = g->categories;
1329         register int c;
1330         register int c2;
1331         register cat_t cat;
1332 
1333         /* avoid making error situations worse */
1334         if (p->error != 0)
1335                 return;
1336 
1337         for (c = 0; c <= UCHAR_MAX; c++)
1338                 if (cats[c] == 0 && isinsets(g, c)) {
1339                         cat = g->ncategories++;
1340                         cats[c] = cat;
1341                         for (c2 = c+1; c2 <= UCHAR_MAX; c2++)
1342                                 if (cats[c2] == 0 && samesets(g, c, c2))
1343                                         cats[c2] = cat;
1344                 }
1345 }
1346 
1347 /*
1348  - dupl - emit a duplicate of a bunch of sops
1349  == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1350  */
1351 static sopno                    /* start of duplicate */
1352 dupl(p, start, finish)
1353 register struct parse *p;
1354 sopno start;                    /* from here */
1355 sopno finish;                   /* to this less one */
1356 {
1357         register sopno ret = HERE();
1358         register sopno len = finish - start;
1359 
1360         assert(finish >= start);
1361         if (len == 0)
1362                 return(ret);
1363         enlarge(p, p->ssize + len);     /* this many unexpected additions */
1364         assert(p->ssize >= p->slen + len);
1365         (void) memcpy((char *)(p->strip + p->slen),
1366                 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1367         p->slen += len;
1368         return(ret);
1369 }
1370 
1371 /*
1372  - doemit - emit a strip operator
1373  == static void doemit(register struct parse *p, sop op, size_t opnd);
1374  *
1375  * It might seem better to implement this as a macro with a function as
1376  * hard-case backup, but it's just too big and messy unless there are
1377  * some changes to the data structures.  Maybe later.
1378  */
1379 static void
1380 doemit(p, op, opnd)
1381 register struct parse *p;
1382 sop op;
1383 size_t opnd;
1384 {
1385         /* avoid making error situations worse */
1386         if (p->error != 0)
1387                 return;
1388 
1389         /* deal with oversize operands ("can't happen", more or less) */
1390         assert(opnd < 1<<OPSHIFT);
1391 
1392         /* deal with undersized strip */
1393         if (p->slen >= p->ssize)
1394                 enlarge(p, (p->ssize+1) / 2 * 3);       /* +50% */
1395         assert(p->slen < p->ssize);
1396 
1397         /* finally, it's all reduced to the easy case */
1398         p->strip[p->slen++] = SOP(op, opnd);
1399 }
1400 
1401 /*
1402  - doinsert - insert a sop into the strip
1403  == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1404  */
1405 static void
1406 doinsert(p, op, opnd, pos)
1407 register struct parse *p;
1408 sop op;
1409 size_t opnd;
1410 sopno pos;
1411 {
1412         register sopno sn;
1413         register sop s;
1414         register int i;
1415 
1416         /* avoid making error situations worse */
1417         if (p->error != 0)
1418                 return;
1419 
1420         sn = HERE();
1421         EMIT(op, opnd);         /* do checks, ensure space */
1422         assert(HERE() == sn+1);
1423         s = p->strip[sn];
1424 
1425         /* adjust paren pointers */
1426         assert(pos > 0);
1427         for (i = 1; i < NPAREN; i++) {
1428                 if (p->pbegin[i] >= pos) {
1429                         p->pbegin[i]++;
1430                 }
1431                 if (p->pend[i] >= pos) {
1432                         p->pend[i]++;
1433                 }
1434         }
1435 
1436         memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1437                                                 (HERE()-pos-1)*sizeof(sop));
1438         p->strip[pos] = s;
1439 }
1440 
1441 /*
1442  - dofwd - complete a forward reference
1443  == static void dofwd(register struct parse *p, sopno pos, sop value);
1444  */
1445 static void
1446 dofwd(p, pos, value)
1447 register struct parse *p;
1448 register sopno pos;
1449 sop value;
1450 {
1451         /* avoid making error situations worse */
1452         if (p->error != 0)
1453                 return;
1454 
1455         assert(value < 1<<OPSHIFT);
1456         p->strip[pos] = OP(p->strip[pos]) | value;
1457 }
1458 
1459 /*
1460  - enlarge - enlarge the strip
1461  == static void enlarge(register struct parse *p, sopno size);
1462  */
1463 static void
1464 enlarge(p, size)
1465 register struct parse *p;
1466 register sopno size;
1467 {
1468         register sop *sp;
1469 
1470         if (p->ssize >= size)
1471                 return;
1472 
1473         sp = (sop *)realloc(p->strip, size*sizeof(sop));
1474         if (sp == NULL) {
1475                 SETERROR(REG_ESPACE);
1476                 return;
1477         }
1478         p->strip = sp;
1479         p->ssize = size;
1480 }
1481 
1482 /*
1483  - stripsnug - compact the strip
1484  == static void stripsnug(register struct parse *p, register struct re_guts *g);
1485  */
1486 static void
1487 stripsnug(p, g)
1488 register struct parse *p;
1489 register struct re_guts *g;
1490 {
1491         g->nstates = p->slen;
1492         g->strip = (sop *)realloc((unsigned char *)p->strip, p->slen * sizeof(sop));
1493         if (g->strip == NULL) {
1494                 SETERROR(REG_ESPACE);
1495                 g->strip = p->strip;
1496         }
1497 }
1498 
1499 /*
1500  - findmust - fill in must and mlen with longest mandatory literal string
1501  == static void findmust(register struct parse *p, register struct re_guts *g);
1502  *
1503  * This algorithm could do fancy things like analyzing the operands of |
1504  * for common subsequences.  Someday.  This code is simple and finds most
1505  * of the interesting cases.
1506  *
1507  * Note that must and mlen got initialized during setup.
1508  */
1509 static void
1510 findmust(p, g)
1511 struct parse *p;
1512 register struct re_guts *g;
1513 {
1514         register sop *scan;
1515         sop *start = NULL;
1516         register sop *newstart = NULL;
1517         register sopno newlen;
1518         register sop s;
1519         register unsigned char *cp;
1520         register sopno i;
1521 
1522         /* avoid making error situations worse */
1523         if (p->error != 0)
1524                 return;
1525 
1526         /* find the longest OCHAR sequence in strip */
1527         newlen = 0;
1528         scan = g->strip + 1;
1529         do {
1530                 s = *scan++;
1531                 switch (OP(s)) {
1532                 case OCHAR:             /* sequence member */
1533                         if (newlen == 0)                /* new sequence */
1534                                 newstart = scan - 1;
1535                         newlen++;
1536                         break;
1537                 case OPLUS_:            /* things that don't break one */
1538                 case OLPAREN:
1539                 case ORPAREN:
1540                         break;
1541                 case OQUEST_:           /* things that must be skipped */
1542                 case OCH_:
1543                         scan--;
1544                         do {
1545                                 scan += OPND(s);
1546                                 s = *scan;
1547                                 /* assert() interferes w debug printouts */
1548                                 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1549                                                         OP(s) != OOR2) {
1550                                         g->iflags |= BAD;
1551                                         return;
1552                                 }
1553                         } while (OP(s) != O_QUEST && OP(s) != O_CH);
1554                         /* fallthrough */
1555                 default:                /* things that break a sequence */
1556                         if (newlen > g->mlen) {         /* ends one */
1557                                 start = newstart;
1558                                 g->mlen = newlen;
1559                         }
1560                         newlen = 0;
1561                         break;
1562                 }
1563         } while (OP(s) != OEND);
1564 
1565         if (g->mlen == 0)               /* there isn't one */
1566                 return;
1567 
1568         if (!start) {
1569                 g->mlen = 0;
1570                 return;
1571         }
1572 
1573         /* turn it into a character string */
1574         g->must = malloc((size_t)g->mlen + 1);
1575         if (g->must == NULL) {          /* argh; just forget it */
1576                 g->mlen = 0;
1577                 return;
1578         }
1579         cp = g->must;
1580         scan = start;
1581         for (i = g->mlen; i > 0; i--) {
1582                 while (OP(s = *scan++) != OCHAR)
1583                         continue;
1584                 assert(cp < g->must + g->mlen);
1585                 *cp++ = (unsigned char)OPND(s);
1586         }
1587         assert(cp == g->must + g->mlen);
1588         *cp++ = '\0';           /* just on general principles */
1589 }
1590 
1591 /*
1592  - pluscount - count + nesting
1593  == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1594  */
1595 static sopno                    /* nesting depth */
1596 pluscount(p, g)
1597 struct parse *p;
1598 register struct re_guts *g;
1599 {
1600         register sop *scan;
1601         register sop s;
1602         register sopno plusnest = 0;
1603         register sopno maxnest = 0;
1604 
1605         if (p->error != 0)
1606                 return(0);      /* there may not be an OEND */
1607 
1608         scan = g->strip + 1;
1609         do {
1610                 s = *scan++;
1611                 switch (OP(s)) {
1612                 case OPLUS_:
1613                         plusnest++;
1614                         break;
1615                 case O_PLUS:
1616                         if (plusnest > maxnest)
1617                                 maxnest = plusnest;
1618                         plusnest--;
1619                         break;
1620                 }
1621         } while (OP(s) != OEND);
1622         if (plusnest != 0)
1623                 g->iflags |= BAD;
1624         return(maxnest);
1625 }

/* [<][>][^][v][top][bottom][index][help] */