root/ext/pcre/php_pcre.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ZEND_DECLARE_MODULE_GLOBALS
  2. php_free_pcre_cache
  3. PHP_GINIT_FUNCTION
  4. PHP_GSHUTDOWN_FUNCTION
  5. PHP_INI_BEGIN
  6. PHP_MINIT_FUNCTION
  7. PHP_MSHUTDOWN_FUNCTION
  8. pcre_clean_cache
  9. make_subpats_table
  10. calculate_unit_length
  11. pcre_get_compiled_regex_cache
  12. pcre_get_compiled_regex
  13. pcre_get_compiled_regex_ex
  14. add_offset_pair
  15. php_do_pcre_match
  16. php_pcre_match_impl
  17. PHP_FUNCTION
  18. PHP_FUNCTION
  19. preg_get_backref
  20. preg_do_repl_func
  21. preg_do_eval
  22. php_pcre_replace
  23. php_pcre_replace_impl
  24. php_replace_in_subject
  25. preg_replace_impl
  26. PHP_FUNCTION
  27. PHP_FUNCTION
  28. PHP_FUNCTION
  29. PHP_FUNCTION
  30. php_pcre_split_impl
  31. PHP_FUNCTION
  32. PHP_FUNCTION
  33. php_pcre_grep_impl
  34. PHP_FUNCTION

   1 /*
   2    +----------------------------------------------------------------------+
   3    | PHP Version 5                                                        |
   4    +----------------------------------------------------------------------+
   5    | Copyright (c) 1997-2016 The PHP Group                                |
   6    +----------------------------------------------------------------------+
   7    | This source file is subject to version 3.01 of the PHP license,      |
   8    | that is bundled with this package in the file LICENSE, and is        |
   9    | available through the world-wide-web at the following url:           |
  10    | http://www.php.net/license/3_01.txt                                  |
  11    | If you did not receive a copy of the PHP license and are unable to   |
  12    | obtain it through the world-wide-web, please send a note to          |
  13    | license@php.net so we can mail you a copy immediately.               |
  14    +----------------------------------------------------------------------+
  15    | Author: Andrei Zmievski <andrei@php.net>                             |
  16    +----------------------------------------------------------------------+
  17  */
  18 
  19 /* $Id$ */
  20 
  21 #include "php.h"
  22 #include "php_ini.h"
  23 #include "php_globals.h"
  24 #include "php_pcre.h"
  25 #include "ext/standard/info.h"
  26 #include "ext/standard/php_smart_str.h"
  27 
  28 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
  29 
  30 #include "ext/standard/php_string.h"
  31 
  32 #define PREG_PATTERN_ORDER                      1
  33 #define PREG_SET_ORDER                          2
  34 #define PREG_OFFSET_CAPTURE                     (1<<8)
  35 
  36 #define PREG_SPLIT_NO_EMPTY                     (1<<0)
  37 #define PREG_SPLIT_DELIM_CAPTURE        (1<<1)
  38 #define PREG_SPLIT_OFFSET_CAPTURE       (1<<2)
  39 
  40 #define PREG_REPLACE_EVAL                       (1<<0)
  41 
  42 #define PREG_GREP_INVERT                        (1<<0)
  43 
  44 #define PCRE_CACHE_SIZE 4096
  45 
  46 /* not fully functional workaround for libpcre < 8.0, see bug #70232 */
  47 #ifndef PCRE_NOTEMPTY_ATSTART
  48 # define PCRE_NOTEMPTY_ATSTART PCRE_NOTEMPTY
  49 #endif
  50 
  51 enum {
  52         PHP_PCRE_NO_ERROR = 0,
  53         PHP_PCRE_INTERNAL_ERROR,
  54         PHP_PCRE_BACKTRACK_LIMIT_ERROR,
  55         PHP_PCRE_RECURSION_LIMIT_ERROR,
  56         PHP_PCRE_BAD_UTF8_ERROR,
  57         PHP_PCRE_BAD_UTF8_OFFSET_ERROR
  58 };
  59 
  60 
     /* Declares the pcre module globals that the rest of this file reaches
        through PCRE_G() (pcre_cache, backtrack_limit, recursion_limit,
        error_code). */
  61 ZEND_DECLARE_MODULE_GLOBALS(pcre)
  62 
  63 
  64 static void pcre_handle_exec_error(int pcre_code TSRMLS_DC) /* {{{ */
  65 {
  66         int preg_code = 0;
  67 
  68         switch (pcre_code) {
  69                 case PCRE_ERROR_MATCHLIMIT:
  70                         preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
  71                         break;
  72 
  73                 case PCRE_ERROR_RECURSIONLIMIT:
  74                         preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
  75                         break;
  76 
  77                 case PCRE_ERROR_BADUTF8:
  78                         preg_code = PHP_PCRE_BAD_UTF8_ERROR;
  79                         break;
  80 
  81                 case PCRE_ERROR_BADUTF8_OFFSET:
  82                         preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
  83                         break;
  84 
  85                 default:
  86                         preg_code = PHP_PCRE_INTERNAL_ERROR;
  87                         break;
  88         }
  89 
  90         PCRE_G(error_code) = preg_code;
  91 }
  92 /* }}} */
  93 
  94 static void php_free_pcre_cache(void *data) /* {{{ */
  95 {
  96         pcre_cache_entry *pce = (pcre_cache_entry *) data;
  97         if (!pce) return;
  98         pefree(pce->re, 1);
  99         if (pce->extra) pefree(pce->extra, 1);
 100 #if HAVE_SETLOCALE
 101         if ((void*)pce->tables) pefree((void*)pce->tables, 1);
 102         pefree(pce->locale, 1);
 103 #endif
 104 }
 105 /* }}} */
 106 
 107 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
 108 {
 109         zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
 110         pcre_globals->backtrack_limit = 0;
 111         pcre_globals->recursion_limit = 0;
 112         pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
 113 }
 114 /* }}} */
 115 
     /* Globals destructor: tears down the compiled-regex cache that
        PHP_GINIT_FUNCTION created with php_free_pcre_cache as its
        per-entry destructor. */
 116 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
 117 {
 118         zend_hash_destroy(&pcre_globals->pcre_cache);
 119 }
 120 /* }}} */
 121 
     /* INI settings, both changeable at any level (PHP_INI_ALL):
          pcre.backtrack_limit (default "1000000") -> globals field backtrack_limit
          pcre.recursion_limit (default "100000")  -> globals field recursion_limit
        php_pcre_match_impl copies these into the pcre_extra match_limit and
        match_limit_recursion fields before matching. */
 122 PHP_INI_BEGIN()
 123         STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
 124         STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
 125 PHP_INI_END()
 126 
 127 
 128 /* {{{ PHP_MINFO_FUNCTION(pcre) */
 129 static PHP_MINFO_FUNCTION(pcre)
 130 {
 131         php_info_print_table_start();
 132         php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
 133         php_info_print_table_row(2, "PCRE Library Version", pcre_version() );
 134         php_info_print_table_end();
 135 
 136         DISPLAY_INI_ENTRIES();
 137 }
 138 /* }}} */
 139 
 140 /* {{{ PHP_MINIT_FUNCTION(pcre) */
 141 static PHP_MINIT_FUNCTION(pcre)
 142 {
 143         REGISTER_INI_ENTRIES();
 144 
 145         REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
 146         REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
 147         REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
 148         REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
 149         REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
 150         REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
 151         REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);
 152 
 153         REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
 154         REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
 155         REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
 156         REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
 157         REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
 158         REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
 159         REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);
 160 
 161         return SUCCESS;
 162 }
 163 /* }}} */
 164 
 165 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
     /* Module shutdown: removes the INI entries registered in
        PHP_MINIT_FUNCTION and always reports SUCCESS. */
 166 static PHP_MSHUTDOWN_FUNCTION(pcre)
 167 {
 168         UNREGISTER_INI_ENTRIES();
 169 
 170         return SUCCESS;
 171 }
 172 /* }}} */
 173 
 174 /* {{{ static pcre_clean_cache */
 175 static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
 176 {
 177         pcre_cache_entry *pce = (pcre_cache_entry *) data;
 178         int *num_clean = (int *)arg;
 179 
 180         if (*num_clean > 0 && !pce->refcount) {
 181                 (*num_clean)--;
 182                 return ZEND_HASH_APPLY_REMOVE;
 183         } else {
 184                 return ZEND_HASH_APPLY_KEEP;
 185         }
 186 }
 187 /* }}} */
 188 
 189 /* {{{ static make_subpats_table */
 190 static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_DC)
 191 {
 192         pcre_extra *extra = pce->extra;
 193         int name_cnt = 0, name_size, ni = 0;
 194         int rc;
 195         char *name_table;
 196         unsigned short name_idx;
 197         char **subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
 198 
 199         rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
 200         if (rc < 0) {
 201                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
 202                 efree(subpat_names);
 203                 return NULL;
 204         }
 205         if (name_cnt > 0) {
 206                 int rc1, rc2;
 207 
 208                 rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
 209                 rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
 210                 rc = rc2 ? rc2 : rc1;
 211                 if (rc < 0) {
 212                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
 213                         efree(subpat_names);
 214                         return NULL;
 215                 }
 216 
 217                 while (ni++ < name_cnt) {
 218                         name_idx = 0xff * (unsigned char)name_table[0] + (unsigned char)name_table[1];
 219                         subpat_names[name_idx] = name_table + 2;
 220                         if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
 221                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed");
 222                                 efree(subpat_names);
 223                                 return NULL;
 224                         }
 225                         name_table += name_size;
 226                 }
 227         }
 228 
 229         return subpat_names;
 230 }
 231 /* }}} */
 232 
 233 /* {{{ static calculate_unit_length */
 234 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE_UTF8. */
 235 static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start)
 236 {
 237         int unit_len;
 238 
 239         if (pce->compile_options & PCRE_UTF8) {
 240                 char *end = start;
 241 
 242                 /* skip continuation bytes */
 243                 while ((*++end & 0xC0) == 0x80);
 244                 unit_len = end - start;
 245         } else {
 246                 unit_len = 1;
 247         }
 248         return unit_len;
 249 }
 250 /* }}} */
 251 
 252 /* {{{ pcre_get_compiled_regex_cache
      *
      * Returns the cache entry for a delimited regex string, compiling and
      * caching it first when necessary.
      *
      *   regex      full pattern text: optional leading whitespace, delimiter,
      *              pattern, closing delimiter, modifiers
      *   regex_len  length of regex in bytes (the hash key uses regex_len+1)
      *
      * A cached entry is reused when pcre_fullinfo() does not report
      * PCRE_ERROR_BADMAGIC for it and, with HAVE_SETLOCALE, its stored locale
      * equals the current LC_CTYPE locale. Otherwise the string is parsed,
      * compiled with pcre_compile(), optionally studied (modifier 'S') and
      * stored in PCRE_G(pcre_cache).
      *
      * Returns a pointer to the entry stored in the cache, or NULL after an
      * E_WARNING for: empty regex, embedded NUL byte, alphanumeric or
      * backslash delimiter, missing closing delimiter, unknown modifier, or
      * compilation failure.
 253  */
 254 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
 255 {
 256         pcre                            *re = NULL;
 257         pcre_extra                      *extra;
 258         int                                      coptions = 0;
 259         int                                      soptions = 0;
 260         const char                      *error;
 261         int                                      erroffset;
 262         char                             delimiter;
 263         char                             start_delimiter;
 264         char                             end_delimiter;
 265         char                            *p, *pp;
 266         char                            *pattern;
 267         int                                      do_study = 0;
 268         int                                      poptions = 0;
 269         int                             count = 0;
 270         unsigned const char *tables = NULL;
 271 #if HAVE_SETLOCALE
 272         char                            *locale;
 273 #endif
 274         pcre_cache_entry        *pce;
 275         pcre_cache_entry         new_entry;
 276         char                *tmp = NULL;
 277 
 278 #if HAVE_SETLOCALE
 279 # if defined(PHP_WIN32) && defined(ZTS)
 280         _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
 281 # endif
 282         locale = setlocale(LC_CTYPE, NULL);
 283 #endif
 284 
 285         /* Try to lookup the cached regex entry, and if successful, just pass
 286            back the compiled pattern, otherwise go on and compile it. */
 287         if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
 288                 /*
 289                  * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
 290                  * is, we flush it and compile the pattern from scratch.
 291                  */
 292                 if (pcre_fullinfo(pce->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
 293                         zend_hash_clean(&PCRE_G(pcre_cache));
 294                 } else {
 295 #if HAVE_SETLOCALE
 296                         if (!strcmp(pce->locale, locale)) {
 297 #endif
 298                                 return pce;
 299 #if HAVE_SETLOCALE
 300                         }
 301 #endif
 302                 }
 303         }
 304 
 305         p = regex;
 306 
 307         /* Parse through the leading whitespace, and display a warning if we
 308            get to the end without encountering a delimiter. */
 309         while (isspace((int)*(unsigned char *)p)) p++;
 310         if (*p == 0) {
 311                 php_error_docref(NULL TSRMLS_CC, E_WARNING,
 312                                                  p < regex + regex_len ? "Null byte in regex" : "Empty regular expression");
 313                 return NULL;
 314         }
 315 
 316         /* Get the delimiter and display a warning if it is alphanumeric
 317            or a backslash. */
 318         delimiter = *p++;
 319         if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
 320                 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
 321                 return NULL;
 322         }
 323 
 324         start_delimiter = delimiter;
     /* Lookup string: an opening bracket sits five characters before its
        closing counterpart, and a closing bracket sits five characters
        before a copy of itself, so pp[5] always yields the closing
        delimiter. Non-bracket delimiters are not found and stay unchanged. */
 325         if ((pp = strchr("([{< )]}> )]}>", delimiter)))
 326                 delimiter = pp[5];
 327         end_delimiter = delimiter;
 328 
 329         pp = p;
 330 
 331         if (start_delimiter == end_delimiter) {
 332                 /* We need to iterate through the pattern, searching for the ending delimiter,
 333                    but skipping the backslashed delimiters.  If the ending delimiter is not
 334                    found, display a warning. */
 335                 while (*pp != 0) {
 336                         if (*pp == '\\' && pp[1] != 0) pp++;
 337                         else if (*pp == delimiter)
 338                                 break;
 339                         pp++;
 340                 }
 341         } else {
 342                 /* We iterate through the pattern, searching for the matching ending
 343                  * delimiter. For each matching starting delimiter, we increment nesting
 344                  * level, and decrement it for each matching ending delimiter. If we
 345                  * reach the end of the pattern without matching, display a warning.
 346                  */
 347                 int brackets = 1;       /* brackets nesting level */
 348                 while (*pp != 0) {
 349                         if (*pp == '\\' && pp[1] != 0) pp++;
 350                         else if (*pp == end_delimiter && --brackets <= 0)
 351                                 break;
 352                         else if (*pp == start_delimiter)
 353                                 brackets++;
 354                         pp++;
 355                 }
 356         }
 357 
     /* The scan stopped on a NUL: either one embedded before regex_len or
        the terminator, meaning no closing delimiter was found. */
 358         if (*pp == 0) {
 359                 if (pp < regex + regex_len) {
 360                         php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
 361                 } else if (start_delimiter == end_delimiter) {
 362                         php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
 363                 } else {
 364                         php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
 365                 }
 366                 return NULL;
 367         }
 368 
 369         /* Make a copy of the actual pattern. */
 370         pattern = estrndup(p, pp-p);
 371 
 372         /* Move on to the options */
 373         pp++;
 374 
 375         /* Parse through the options, setting appropriate flags.  Display
 376            a warning if we encounter an unknown modifier. */
 377         while (pp < regex + regex_len) {
 378                 switch (*pp++) {
 379                         /* Perl compatible options */
 380                         case 'i':       coptions |= PCRE_CASELESS;              break;
 381                         case 'm':       coptions |= PCRE_MULTILINE;             break;
 382                         case 's':       coptions |= PCRE_DOTALL;                break;
 383                         case 'x':       coptions |= PCRE_EXTENDED;              break;
 384 
 385                         /* PCRE specific options */
 386                         case 'A':       coptions |= PCRE_ANCHORED;              break;
 387                         case 'D':       coptions |= PCRE_DOLLAR_ENDONLY;break;
 388                         case 'S':       do_study  = 1;                                  break;
 389                         case 'U':       coptions |= PCRE_UNGREEDY;              break;
 390                         case 'X':       coptions |= PCRE_EXTRA;                 break;
 391                         case 'u':       coptions |= PCRE_UTF8;
 392         /* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
 393        characters, even in UTF-8 mode. However, this can be changed by setting
 394        the PCRE_UCP option. */
 395 #ifdef PCRE_UCP
 396                                                 coptions |= PCRE_UCP;
 397 #endif
 398                                 break;
 399 
 400                         /* Custom preg options */
 401                         case 'e':       poptions |= PREG_REPLACE_EVAL;  break;
 402 
     /* spaces and newlines among the modifiers are ignored */
 403                         case ' ':
 404                         case '\n':
 405                                 break;
 406 
 407                         default:
 408                                 if (pp[-1]) {
 409                                         php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
 410                                 } else {
 411                                         php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
 412                                 }
 413                                 efree(pattern);
 414                                 return NULL;
 415                 }
 416         }
 417 
     /* Any locale other than "C" gets its own character tables; they are
        stored in the cache entry below, or freed here if compilation fails. */
 418 #if HAVE_SETLOCALE
 419         if (strcmp(locale, "C"))
 420                 tables = pcre_maketables();
 421 #endif
 422 
 423         /* Compile pattern and display a warning if compilation failed. */
 424         re = pcre_compile(pattern,
 425                                           coptions,
 426                                           &error,
 427                                           &erroffset,
 428                                           tables);
 429 
 430         if (re == NULL) {
 431                 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
 432                 efree(pattern);
 433                 if (tables) {
 434                         pefree((void*)tables, 1);
 435                 }
 436                 return NULL;
 437         }
 438 
 439         /* If study option was specified, study the pattern and
 440            store the result in extra for passing to pcre_exec. */
 441         if (do_study) {
 442                 extra = pcre_study(re, soptions, &error);
 443                 if (extra) {
 444                         extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 445                 }
 446                 if (error != NULL) {
 447                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "Error while studying pattern");
 448                 }
 449         } else {
 450                 extra = NULL;
 451         }
 452 
 453         efree(pattern);
 454 
 455         /*
 456          * If we reached cache limit, clean out the items from the head of the list;
 457          * these are supposedly the oldest ones (but not necessarily the least used
 458          * ones).
 459          */
 460         if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
     /* pcre_clean_cache removes at most num_clean entries, and only those
        whose refcount is zero */
 461                 int num_clean = PCRE_CACHE_SIZE / 8;
 462                 zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean TSRMLS_CC);
 463         }
 464 
 465         /* Store the compiled pattern and extra info in the cache. */
 466         new_entry.re = re;
 467         new_entry.extra = extra;
 468         new_entry.preg_options = poptions;
 469         new_entry.compile_options = coptions;
 470 #if HAVE_SETLOCALE
 471         new_entry.locale = pestrdup(locale, 1);
 472         new_entry.tables = tables;
 473 #endif
 474         new_entry.refcount = 0;
 475 
 476         /*
 477          * Interned strings are not duplicated when stored in HashTable,
 478          * but all the interned strings created during HTTP request are removed
 479          * at end of request. However PCRE_G(pcre_cache) must be consistent
 480          * on the next request as well. So we disable usage of interned strings
 481          * as hash keys especially for this table.
 482          * See bug #63180
 483          */
 484         if (IS_INTERNED(regex)) {
 485                 regex = tmp = estrndup(regex, regex_len);
 486         }
 487 
     /* NOTE(review): when an entry for this key already exists (cached under
        a different locale), this update replaces it; presumably the old
        entry is released through php_free_pcre_cache even if its refcount
        is non-zero -- confirm that no caller can still hold it. */
 488         zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
 489                                                 sizeof(pcre_cache_entry), (void**)&pce);
 490 
 491         if (tmp) {
 492                 efree(tmp);
 493         }
 494 
 495         return pce;
 496 }
 497 /* }}} */
 498 
 499 /* {{{ pcre_get_compiled_regex
 500  */
 501 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
 502 {
 503         pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
 504 
 505         if (extra) {
 506                 *extra = pce ? pce->extra : NULL;
 507         }
 508         if (preg_options) {
 509                 *preg_options = pce ? pce->preg_options : 0;
 510         }
 511 
 512         return pce ? pce->re : NULL;
 513 }
 514 /* }}} */
 515 
 516 /* {{{ pcre_get_compiled_regex_ex
 517  */
 518 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
 519 {
 520         pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
 521 
 522         if (extra) {
 523                 *extra = pce ? pce->extra : NULL;
 524         }
 525         if (preg_options) {
 526                 *preg_options = pce ? pce->preg_options : 0;
 527         }
 528         if (compile_options) {
 529                 *compile_options = pce ? pce->compile_options : 0;
 530         }
 531 
 532         return pce ? pce->re : NULL;
 533 }
 534 /* }}} */
 535 
 536 /* {{{ add_offset_pair */
 537 static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
 538 {
 539         zval *match_pair;
 540 
 541         ALLOC_ZVAL(match_pair);
 542         array_init(match_pair);
 543         INIT_PZVAL(match_pair);
 544 
 545         /* Add (match, offset) to the return value */
 546         add_next_index_stringl(match_pair, str, len, 1);
 547         add_next_index_long(match_pair, offset);
 548 
 549         if (name) {
 550                 zval_add_ref(&match_pair);
 551                 zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, &match_pair, sizeof(zval *), NULL);
 552         }
 553         zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
 554 }
 555 /* }}} */
 556 
 557 static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
 558 {
 559         /* parameters */
 560         char                     *regex;                        /* Regular expression */
 561         char                     *subject;                      /* String to match against */
 562         int                               regex_len;
 563         int                               subject_len;
 564         pcre_cache_entry *pce;                          /* Compiled regular expression */
 565         zval                     *subpats = NULL;       /* Array for subpatterns */
 566         long                      flags = 0;            /* Match control flags */
 567         long                      start_offset = 0;     /* Where the new search starts */
 568 
 569         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|zll", &regex, &regex_len,
 570                                                           &subject, &subject_len, &subpats, &flags, &start_offset) == FAILURE) {
 571                 RETURN_FALSE;
 572         }
 573 
 574         /* Compile regex or get it from cache. */
 575         if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
 576                 RETURN_FALSE;
 577         }
 578 
 579         pce->refcount++;
 580         php_pcre_match_impl(pce, subject, subject_len, return_value, subpats,
 581                 global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
 582         pce->refcount--;
 583 }
 584 /* }}} */
 585 
 586 /* {{{ php_pcre_match_impl() */
 587 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
 588         zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
 589 {
 590         zval                    *result_set,            /* Holds a set of subpatterns after
 591                                                                                    a global match */
 592                                    **match_sets = NULL; /* An array of sets of matches for each
 593                                                                                    subpattern after a global match */
 594         pcre_extra              *extra = pce->extra;/* Holds results of studying */
 595         pcre_extra               extra_data;            /* Used locally for exec options */
 596         int                              exoptions = 0;         /* Execution options */
 597         int                              count = 0;                     /* Count of matched subpatterns */
 598         int                             *offsets;                       /* Array of subpattern offsets */
 599         int                              num_subpats;           /* Number of captured subpatterns */
 600         int                              size_offsets;          /* Size of the offsets array */
 601         int                              matched;                       /* Has anything matched */
 602         int                              g_notempty = 0;        /* If the match should not be empty */
 603         const char         **stringlist;                /* Holds list of subpatterns */
 604         char               **subpat_names;              /* Array for named subpatterns */
 605         int                              i, rc;
 606         int                              subpats_order;         /* Order of subpattern matches */
 607         int                              offset_capture;    /* Capture match offsets: yes/no */
 608         unsigned char   *mark = NULL;       /* Target for MARK name */
 609         zval            *marks = NULL;      /* Array of marks for PREG_PATTERN_ORDER */
 610 
 611         /* Overwrite the passed-in value for subpatterns with an empty array. */
 612         if (subpats != NULL) {
 613                 zval_dtor(subpats);
 614                 array_init(subpats);
 615         }
 616 
 617         subpats_order = global ? PREG_PATTERN_ORDER : 0;
 618 
 619         if (use_flags) {
 620                 offset_capture = flags & PREG_OFFSET_CAPTURE;
 621 
 622                 /*
 623                  * subpats_order is pre-set to pattern mode so we change it only if
 624                  * necessary.
 625                  */
 626                 if (flags & 0xff) {
 627                         subpats_order = flags & 0xff;
 628                 }
 629                 if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
 630                         (!global && subpats_order != 0)) {
 631                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid flags specified");
 632                         return;
 633                 }
 634         } else {
 635                 offset_capture = 0;
 636         }
 637 
 638         /* Negative offset counts from the end of the string. */
 639         if (start_offset < 0) {
 640                 start_offset = subject_len + start_offset;
 641                 if (start_offset < 0) {
 642                         start_offset = 0;
 643                 }
 644         }
 645 
 646         if (extra == NULL) {
 647                 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
 648                 extra = &extra_data;
 649         }
 650         extra->match_limit = PCRE_G(backtrack_limit);
 651         extra->match_limit_recursion = PCRE_G(recursion_limit);
 652 #ifdef PCRE_EXTRA_MARK
 653         extra->mark = &mark;
 654         extra->flags |= PCRE_EXTRA_MARK;
 655 #endif
 656 
 657         /* Calculate the size of the offsets array, and allocate memory for it. */
 658         rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
 659         if (rc < 0) {
 660                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
 661                 RETURN_FALSE;
 662         }
 663         num_subpats++;
 664         size_offsets = num_subpats * 3;
 665 
 666         /*
 667          * Build a mapping from subpattern numbers to their names. We will always
 668          * allocate the table, even though there may be no named subpatterns. This
 669          * avoids somewhat more complicated logic in the inner loops.
 670          */
 671         subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
 672         if (!subpat_names) {
 673                 RETURN_FALSE;
 674         }
 675 
 676         offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
 677         memset(offsets, 0, size_offsets*sizeof(int));
 678         /* Allocate match sets array and initialize the values. */
 679         if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
 680                 match_sets = (zval **)safe_emalloc(num_subpats, sizeof(zval *), 0);
 681                 for (i=0; i<num_subpats; i++) {
 682                         ALLOC_ZVAL(match_sets[i]);
 683                         array_init(match_sets[i]);
 684                         INIT_PZVAL(match_sets[i]);
 685                 }
 686         }
 687 
 688         matched = 0;
 689         PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
 690 
 691         do {
 692                 /* Execute the regular expression. */
 693                 count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
 694                                                   exoptions|g_notempty, offsets, size_offsets);
 695 
 696                 /* the string was already proved to be valid UTF-8 */
 697                 exoptions |= PCRE_NO_UTF8_CHECK;
 698 
 699                 /* Check for too many substrings condition. */
 700                 if (count == 0) {
 701                         php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
 702                         count = size_offsets/3;
 703                 }
 704 
 705                 /* If something has matched */
 706                 if (count > 0) {
 707                         matched++;
 708 
 709                         /* If subpatterns array has been passed, fill it in with values. */
 710                         if (subpats != NULL) {
 711                                 /* Try to get the list of substrings and display a warning if failed. */
 712                                 if ((offsets[1] - offsets[0] < 0) || pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
 713                                         efree(subpat_names);
 714                                         efree(offsets);
 715                                         if (match_sets) efree(match_sets);
 716                                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "Get subpatterns list failed");
 717                                         RETURN_FALSE;
 718                                 }
 719 
 720                                 if (global) {   /* global pattern matching */
 721                                         if (subpats && subpats_order == PREG_PATTERN_ORDER) {
 722                                                 /* For each subpattern, insert it into the appropriate array. */
 723                                                 for (i = 0; i < count; i++) {
 724                                                         if (offset_capture) {
 725                                                                 add_offset_pair(match_sets[i], (char *)stringlist[i],
 726                                                                                                 offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
 727                                                         } else {
 728                                                                 add_next_index_stringl(match_sets[i], (char *)stringlist[i],
 729                                                                                                            offsets[(i<<1)+1] - offsets[i<<1], 1);
 730                                                         }
 731                                                 }
 732                                                 /* Add MARK, if available */
 733                                                 if (mark) {
 734                                                         if (!marks) {
 735                                                                 MAKE_STD_ZVAL(marks);
 736                                                                 array_init(marks);
 737                                                         }
 738                                                         add_index_string(marks, matched - 1, (char *) mark, 1);
 739                                                 }
 740                                                 /*
 741                                                  * If the number of captured subpatterns on this run is
 742                                                  * less than the total possible number, pad the result
 743                                                  * arrays with empty strings.
 744                                                  */
 745                                                 if (count < num_subpats) {
 746                                                         for (; i < num_subpats; i++) {
 747                                                                 add_next_index_string(match_sets[i], "", 1);
 748                                                         }
 749                                                 }
 750                                         } else {
 751                                                 /* Allocate the result set array */
 752                                                 ALLOC_ZVAL(result_set);
 753                                                 array_init(result_set);
 754                                                 INIT_PZVAL(result_set);
 755 
 756                                                 /* Add all the subpatterns to it */
 757                                                 for (i = 0; i < count; i++) {
 758                                                         if (offset_capture) {
 759                                                                 add_offset_pair(result_set, (char *)stringlist[i],
 760                                                                                                 offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
 761                                                         } else {
 762                                                                 if (subpat_names[i]) {
 763                                                                         add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
 764                                                                                                                    offsets[(i<<1)+1] - offsets[i<<1], 1);
 765                                                                 }
 766                                                                 add_next_index_stringl(result_set, (char *)stringlist[i],
 767                                                                                                            offsets[(i<<1)+1] - offsets[i<<1], 1);
 768                                                         }
 769                                                 }
 770                                                 /* Add MARK, if available */
 771                                                 if (mark) {
 772                                                         add_assoc_string(result_set, "MARK", (char *) mark, 1);
 773                                                 }
 774                                                 /* And add it to the output array */
 775                                                 zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set, sizeof(zval *), NULL);
 776                                         }
 777                                 } else {                        /* single pattern matching */
 778                                         /* For each subpattern, insert it into the subpatterns array. */
 779                                         for (i = 0; i < count; i++) {
 780                                                 if (offset_capture) {
 781                                                         add_offset_pair(subpats, (char *)stringlist[i],
 782                                                                                         offsets[(i<<1)+1] - offsets[i<<1],
 783                                                                                         offsets[i<<1], subpat_names[i]);
 784                                                 } else {
 785                                                         if (subpat_names[i]) {
 786                                                                 add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
 787                                                                                                   offsets[(i<<1)+1] - offsets[i<<1], 1);
 788                                                         }
 789                                                         add_next_index_stringl(subpats, (char *)stringlist[i],
 790                                                                                                    offsets[(i<<1)+1] - offsets[i<<1], 1);
 791                                                 }
 792                                         }
 793                                         /* Add MARK, if available */
 794                                         if (mark) {
 795                                                 add_assoc_string(subpats, "MARK", (char *) mark, 1);
 796                                         }
 797                                 }
 798 
 799                                 pcre_free((void *) stringlist);
 800                         }
 801                 } else if (count == PCRE_ERROR_NOMATCH) {
 802                         /* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
 803                            this is not necessarily the end. We need to advance
 804                            the start offset, and continue. Fudge the offset values
 805                            to achieve this, unless we're already at the end of the string. */
 806                         if (g_notempty != 0 && start_offset < subject_len) {
 807                                 int unit_len = calculate_unit_length(pce, subject + start_offset);
 808                                 
 809                                 offsets[0] = start_offset;
 810                                 offsets[1] = start_offset + unit_len;
 811                         } else
 812                                 break;
 813                 } else {
 814                         pcre_handle_exec_error(count TSRMLS_CC);
 815                         break;
 816                 }
 817 
 818                 /* If we have matched an empty string, mimic what Perl's /g options does.
 819                    This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
 820                    the match again at the same point. If this fails (picked up above) we
 821                    advance to the next character. */
 822                 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
 823                 
 824                 /* Advance to the position right after the last full match */
 825                 start_offset = offsets[1];
 826         } while (global);
 827 
 828         /* Add the match sets to the output array and clean up */
 829         if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
 830                 for (i = 0; i < num_subpats; i++) {
 831                         if (subpat_names[i]) {
 832                                 zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i],
 833                                                                  strlen(subpat_names[i])+1, &match_sets[i], sizeof(zval *), NULL);
 834                                 Z_ADDREF_P(match_sets[i]);
 835                         }
 836                         zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i], sizeof(zval *), NULL);
 837                 }
 838                 efree(match_sets);
 839 
 840                 if (marks) {
 841                         add_assoc_zval(subpats, "MARK", marks);
 842                 }
 843         }
 844 
 845         efree(offsets);
 846         efree(subpat_names);
 847 
 848         /* Did we encounter an error? */
 849         if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
 850                 RETVAL_LONG(matched);
 851         } else {
 852                 RETVAL_FALSE;
 853         }
 854 }
 855 /* }}} */
 856 
 857 /* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
 858    Perform a Perl-style regular expression match */
 859 static PHP_FUNCTION(preg_match)
 860 {
 861         php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
 862 }
 863 /* }}} */
 864 
 865 /* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
 866    Perform a Perl-style global regular expression match */
 867 static PHP_FUNCTION(preg_match_all)
 868 {
 869         php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
 870 }
 871 /* }}} */
 872 
 873 /* {{{ preg_get_backref
 874  */
 875 static int preg_get_backref(char **str, int *backref)
 876 {
 877         register char in_brace = 0;
 878         register char *walk = *str;
 879 
 880         if (walk[1] == 0)
 881                 return 0;
 882 
 883         if (*walk == '$' && walk[1] == '{') {
 884                 in_brace = 1;
 885                 walk++;
 886         }
 887         walk++;
 888 
 889         if (*walk >= '0' && *walk <= '9') {
 890                 *backref = *walk - '0';
 891                 walk++;
 892         } else
 893                 return 0;
 894 
 895         if (*walk && *walk >= '0' && *walk <= '9') {
 896                 *backref = *backref * 10 + *walk - '0';
 897                 walk++;
 898         }
 899 
 900         if (in_brace) {
 901                 if (*walk == 0 || *walk != '}')
 902                         return 0;
 903                 else
 904                         walk++;
 905         }
 906 
 907         *str = walk;
 908         return 1;
 909 }
 910 /* }}} */
 911 
 912 /* {{{ preg_do_repl_func
 913  */
 914 static int preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, unsigned char *mark, char **result TSRMLS_DC)
 915 {
 916         zval            *retval_ptr;            /* Function return value */
 917         zval       **args[1];                   /* Argument to pass to function */
 918         zval            *subpats;                       /* Captured subpatterns */
 919         int                      result_len;            /* Return value length */
 920         int                      i;
 921 
 922         MAKE_STD_ZVAL(subpats);
 923         array_init(subpats);
 924         for (i = 0; i < count; i++) {
 925                 if (subpat_names[i]) {
 926                         add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1], 1);
 927                 }
 928                 add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
 929         }
 930         if (mark) {
 931                 add_assoc_string(subpats, "MARK", (char *) mark, 1);
 932         }
 933         args[0] = &subpats;
 934 
 935         if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
 936                 convert_to_string_ex(&retval_ptr);
 937                 *result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
 938                 result_len = Z_STRLEN_P(retval_ptr);
 939                 zval_ptr_dtor(&retval_ptr);
 940         } else {
 941                 if (!EG(exception)) {
 942                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
 943                 }
 944                 result_len = offsets[1] - offsets[0];
 945                 *result = estrndup(&subject[offsets[0]], result_len);
 946         }
 947 
 948         zval_ptr_dtor(&subpats);
 949 
 950         return result_len;
 951 }
 952 /* }}} */
 953 
 954 /* {{{ preg_do_eval
 955  */
 956 static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
 957                                                 int *offsets, int count, char **result TSRMLS_DC)
 958 {
 959         zval             retval;                        /* Return value from evaluation */
 960         char            *eval_str_end,          /* End of eval string */
 961                                 *match,                         /* Current match for a backref */
 962                                 *esc_match,                     /* Quote-escaped match */
 963                                 *walk,                          /* Used to walk the code string */
 964                                 *segment,                       /* Start of segment to append while walking */
 965                                  walk_last;                     /* Last walked character */
 966         int                      match_len;                     /* Length of the match */
 967         int                      esc_match_len;         /* Length of the quote-escaped match */
 968         int                      result_len;            /* Length of the result of the evaluation */
 969         int                      backref;                       /* Current backref */
 970         char        *compiled_string_description;
 971         smart_str    code = {0};
 972 
 973         eval_str_end = eval_str + eval_str_len;
 974         walk = segment = eval_str;
 975         walk_last = 0;
 976 
 977         while (walk < eval_str_end) {
 978                 /* If found a backreference.. */
 979                 if ('\\' == *walk || '$' == *walk) {
 980                         smart_str_appendl(&code, segment, walk - segment);
 981                         if (walk_last == '\\') {
 982                                 code.c[code.len-1] = *walk++;
 983                                 segment = walk;
 984                                 walk_last = 0;
 985                                 continue;
 986                         }
 987                         segment = walk;
 988                         if (preg_get_backref(&walk, &backref)) {
 989                                 if (backref < count) {
 990                                         /* Find the corresponding string match and substitute it
 991                                            in instead of the backref */
 992                                         match = subject + offsets[backref<<1];
 993                                         match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
 994                                         if (match_len) {
 995                                                 esc_match = php_addslashes(match, match_len, &esc_match_len, 0 TSRMLS_CC);
 996                                         } else {
 997                                                 esc_match = match;
 998                                                 esc_match_len = 0;
 999                                         }
1000                                 } else {
1001                                         esc_match = "";
1002                                         esc_match_len = 0;
1003                                 }
1004                                 smart_str_appendl(&code, esc_match, esc_match_len);
1005 
1006                                 segment = walk;
1007 
1008                                 /* Clean up and reassign */
1009                                 if (esc_match_len)
1010                                         efree(esc_match);
1011                                 continue;
1012                         }
1013                 }
1014                 walk++;
1015                 walk_last = walk[-1];
1016         }
1017         smart_str_appendl(&code, segment, walk - segment);
1018         smart_str_0(&code);
1019 
1020         compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
1021         /* Run the code */
1022         if (zend_eval_stringl(code.c, code.len, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
1023                 efree(compiled_string_description);
1024                 php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, code.c);
1025                 /* zend_error() does not return in this case */
1026         }
1027         efree(compiled_string_description);
1028         convert_to_string(&retval);
1029 
1030         /* Save the return value and its length */
1031         *result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
1032         result_len = Z_STRLEN(retval);
1033 
1034         /* Clean up */
1035         zval_dtor(&retval);
1036         smart_str_free(&code);
1037 
1038         return result_len;
1039 }
1040 /* }}} */
1041 
1042 /* {{{ php_pcre_replace
1043  */
1044 PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
1045                                                           char *subject, int subject_len,
1046                                                           zval *replace_val, int is_callable_replace,
1047                                                           int *result_len, int limit, int *replace_count TSRMLS_DC)
1048 {
1049         pcre_cache_entry        *pce;                       /* Compiled regular expression */
1050         char                            *result;                        /* Function result */
1051 
1052         /* Compile regex or get it from cache. */
1053         if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1054                 return NULL;
1055         }
1056         pce->refcount++;
1057         result = php_pcre_replace_impl(pce, subject, subject_len, replace_val, 
1058                 is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
1059         pce->refcount--;
1060 
1061         return result;
1062 }
1063 /* }}} */
1064 
1065 /* {{{ php_pcre_replace_impl() */
1066 PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val,
1067         int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
1068 {
1069         pcre_extra              *extra = pce->extra;/* Holds results of studying */
1070         pcre_extra               extra_data;            /* Used locally for exec options */
1071         int                              exoptions = 0;         /* Execution options */
1072         int                              count = 0;                     /* Count of matched subpatterns */
1073         int                             *offsets;                       /* Array of subpattern offsets */
1074         char                    **subpat_names;         /* Array for named subpatterns */
1075         int                              num_subpats;           /* Number of captured subpatterns */
1076         int                              size_offsets;          /* Size of the offsets array */
1077         int                              new_len;                       /* Length of needed storage */
1078         int                              alloc_len;                     /* Actual allocated length */
1079         int                              eval_result_len=0;     /* Length of the eval'ed or
1080                                                                                    function-returned string */
1081         int                              match_len;                     /* Length of the current match */
1082         int                              backref;                       /* Backreference number */
1083         int                              eval;                          /* If the replacement string should be eval'ed */
1084         int                              start_offset;          /* Where the new search starts */
1085         int                              g_notempty=0;          /* If the match should not be empty */
1086         int                              replace_len=0;         /* Length of replacement string */
1087         char                    *result,                        /* Result of replacement */
1088                                         *replace=NULL,          /* Replacement string */
1089                                         *new_buf,                       /* Temporary buffer for re-allocation */
1090                                         *walkbuf,                       /* Location of current replacement in the result */
1091                                         *walk,                          /* Used to walk the replacement string */
1092                                         *match,                         /* The current match */
1093                                         *piece,                         /* The current piece of subject */
1094                                         *replace_end=NULL,      /* End of replacement string */
1095                                         *eval_result,           /* Result of eval or custom function */
1096                                          walk_last;                     /* Last walked character */
1097         int                              rc;
1098         unsigned char   *mark = NULL;       /* Target for MARK name */
1099 
1100         if (extra == NULL) {
1101                 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1102                 extra = &extra_data;
1103         }
1104         extra->match_limit = PCRE_G(backtrack_limit);
1105         extra->match_limit_recursion = PCRE_G(recursion_limit);
1106 #ifdef PCRE_EXTRA_MARK
1107         extra->mark = &mark;
1108         extra->flags |= PCRE_EXTRA_MARK;
1109 #endif
1110 
1111         eval = pce->preg_options & PREG_REPLACE_EVAL;
1112         if (is_callable_replace) {
1113                 if (eval) {
1114                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "Modifier /e cannot be used with replacement callback");
1115                         return NULL;
1116                 }
1117         } else {
1118                 replace = Z_STRVAL_P(replace_val);
1119                 replace_len = Z_STRLEN_P(replace_val);
1120                 replace_end = replace + replace_len;
1121         }
1122 
1123         if (eval) {
1124                 php_error_docref(NULL TSRMLS_CC, E_DEPRECATED, "The /e modifier is deprecated, use preg_replace_callback instead");
1125         }
1126 
1127         /* Calculate the size of the offsets array, and allocate memory for it. */
1128         rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
1129         if (rc < 0) {
1130                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1131                 return NULL;
1132         }
1133         num_subpats++;
1134         size_offsets = num_subpats * 3;
1135 
1136         /*
1137          * Build a mapping from subpattern numbers to their names. We will always
1138          * allocate the table, even though there may be no named subpatterns. This
1139          * avoids somewhat more complicated logic in the inner loops.
1140          */
1141         subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
1142         if (!subpat_names) {
1143                 return NULL;
1144         }
1145 
1146         offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1147 
1148         alloc_len = 2 * subject_len + 1;
1149         result = safe_emalloc(alloc_len, sizeof(char), 0);
1150 
1151         /* Initialize */
1152         match = NULL;
1153         *result_len = 0;
1154         start_offset = 0;
1155         PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1156 
1157         while (1) {
1158                 /* Execute the regular expression. */
1159                 count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
1160                                                   exoptions|g_notempty, offsets, size_offsets);
1161 
1162                 /* the string was already proved to be valid UTF-8 */
1163                 exoptions |= PCRE_NO_UTF8_CHECK;
1164 
1165                 /* Check for too many substrings condition. */
1166                 if (count == 0) {
1167                         php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1168                         count = size_offsets/3;
1169                 }
1170 
1171                 piece = subject + start_offset;
1172 
1173                 if (count > 0 && (offsets[1] - offsets[0] >= 0) && (limit == -1 || limit > 0)) {
1174                         if (replace_count) {
1175                                 ++*replace_count;
1176                         }
1177                         /* Set the match location in subject */
1178                         match = subject + offsets[0];
1179 
1180                         new_len = *result_len + offsets[0] - start_offset; /* part before the match */
1181 
1182                         /* If evaluating, do it and add the return string's length */
1183                         if (eval) {
1184                                 eval_result_len = preg_do_eval(replace, replace_len, subject,
1185                                                                                            offsets, count, &eval_result TSRMLS_CC);
1186                                 new_len += eval_result_len;
1187                         } else if (is_callable_replace) {
1188                                 /* Use custom function to get replacement string and its length. */
1189                                 eval_result_len = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, mark, &eval_result TSRMLS_CC);
1190                                 new_len += eval_result_len;
1191                         } else { /* do regular substitution */
1192                                 walk = replace;
1193                                 walk_last = 0;
1194                                 while (walk < replace_end) {
1195                                         if ('\\' == *walk || '$' == *walk) {
1196                                                 if (walk_last == '\\') {
1197                                                         walk++;
1198                                                         walk_last = 0;
1199                                                         continue;
1200                                                 }
1201                                                 if (preg_get_backref(&walk, &backref)) {
1202                                                         if (backref < count)
1203                                                                 new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1204                                                         continue;
1205                                                 }
1206                                         }
1207                                         new_len++;
1208                                         walk++;
1209                                         walk_last = walk[-1];
1210                                 }
1211                         }
1212 
1213                         if (new_len + 1 > alloc_len) {
1214                                 alloc_len = 1 + alloc_len + 2 * new_len;
1215                                 new_buf = emalloc(alloc_len);
1216                                 memcpy(new_buf, result, *result_len);
1217                                 efree(result);
1218                                 result = new_buf;
1219                         }
1220                         /* copy the part of the string before the match */
1221                         memcpy(&result[*result_len], piece, match-piece);
1222                         *result_len += match-piece;
1223 
1224                         /* copy replacement and backrefs */
1225                         walkbuf = result + *result_len;
1226 
1227                         /* If evaluating or using custom function, copy result to the buffer
1228                          * and clean up. */
1229                         if (eval || is_callable_replace) {
1230                                 memcpy(walkbuf, eval_result, eval_result_len);
1231                                 *result_len += eval_result_len;
1232                                 STR_FREE(eval_result);
1233                         } else { /* do regular backreference copying */
1234                                 walk = replace;
1235                                 walk_last = 0;
1236                                 while (walk < replace_end) {
1237                                         if ('\\' == *walk || '$' == *walk) {
1238                                                 if (walk_last == '\\') {
1239                                                         *(walkbuf-1) = *walk++;
1240                                                         walk_last = 0;
1241                                                         continue;
1242                                                 }
1243                                                 if (preg_get_backref(&walk, &backref)) {
1244                                                         if (backref < count) {
1245                                                                 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1246                                                                 memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1247                                                                 walkbuf += match_len;
1248                                                         }
1249                                                         continue;
1250                                                 }
1251                                         }
1252                                         *walkbuf++ = *walk++;
1253                                         walk_last = walk[-1];
1254                                 }
1255                                 *walkbuf = '\0';
1256                                 /* increment the result length by how much we've added to the string */
1257                                 *result_len += walkbuf - (result + *result_len);
1258                         }
1259 
1260                         if (limit != -1)
1261                                 limit--;
1262 
1263                 } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1264                         /* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
1265                            this is not necessarily the end. We need to advance
1266                            the start offset, and continue. Fudge the offset values
1267                            to achieve this, unless we're already at the end of the string. */
1268                         if (g_notempty != 0 && start_offset < subject_len) {
1269                                 int unit_len = calculate_unit_length(pce, piece);
1270 
1271                                 offsets[0] = start_offset;
1272                                 offsets[1] = start_offset + unit_len;
1273                                 memcpy(&result[*result_len], piece, unit_len);
1274                                 *result_len += unit_len;
1275                         } else {
1276                                 new_len = *result_len + subject_len - start_offset;
1277                                 if (new_len + 1 > alloc_len) {
1278                                         alloc_len = new_len + 1; /* now we know exactly how long it is */
1279                                         new_buf = safe_emalloc(alloc_len, sizeof(char), 0);
1280                                         memcpy(new_buf, result, *result_len);
1281                                         efree(result);
1282                                         result = new_buf;
1283                                 }
1284                                 /* stick that last bit of string on our output */
1285                                 memcpy(&result[*result_len], piece, subject_len - start_offset);
1286                                 *result_len += subject_len - start_offset;
1287                                 result[*result_len] = '\0';
1288                                 break;
1289                         }
1290                 } else {
1291                         pcre_handle_exec_error(count TSRMLS_CC);
1292                         efree(result);
1293                         result = NULL;
1294                         break;
1295                 }
1296 
1297                 /* If we have matched an empty string, mimic what Perl's /g options does.
1298                    This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
1299                    the match again at the same point. If this fails (picked up above) we
1300                    advance to the next character. */
1301                 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
1302                 
1303                 /* Advance to the next piece. */
1304                 start_offset = offsets[1];
1305         }
1306 
1307         efree(offsets);
1308         efree(subpat_names);
1309 
1310         return result;
1311 }
1312 /* }}} */
1313 
1314 /* {{{ php_replace_in_subject
1315  */
1316 static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, int is_callable_replace, int *replace_count TSRMLS_DC)
1317 {
1318         zval            **regex_entry,
1319                                 **replace_entry = NULL,
1320                                  *replace_value,
1321                                   empty_replace;
1322         char            *subject_value,
1323                                 *result;
1324         int                      subject_len;
1325 
1326         /* Make sure we're dealing with strings. */
1327         convert_to_string_ex(subject);
1328         /* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1329         ZVAL_STRINGL(&empty_replace, "", 0, 0);
1330 
1331         /* If regex is an array */
1332         if (Z_TYPE_P(regex) == IS_ARRAY) {
1333                 /* Duplicate subject string for repeated replacement */
1334                 subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
1335                 subject_len = Z_STRLEN_PP(subject);
1336                 *result_len = subject_len;
1337 
1338                 zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
1339 
1340                 replace_value = replace;
1341                 if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
1342                         zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
1343 
1344                 /* For each entry in the regex array, get the entry */
1345                 while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)&regex_entry) == SUCCESS) {
1346                         /* Make sure we're dealing with strings. */
1347                         convert_to_string_ex(regex_entry);
1348 
1349                         /* If replace is an array and not a callable construct */
1350                         if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
1351                                 /* Get current entry */
1352                                 if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
1353                                         if (!is_callable_replace) {
1354                                                 convert_to_string_ex(replace_entry);
1355                                         }
1356                                         replace_value = *replace_entry;
1357                                         zend_hash_move_forward(Z_ARRVAL_P(replace));
1358                                 } else {
1359                                         /* We've run out of replacement strings, so use an empty one */
1360                                         replace_value = &empty_replace;
1361                                 }
1362                         }
1363 
1364                         /* Do the actual replacement and put the result back into subject_value
1365                            for further replacements. */
1366                         if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
1367                                                                                    Z_STRLEN_PP(regex_entry),
1368                                                                                    subject_value,
1369                                                                                    subject_len,
1370                                                                                    replace_value,
1371                                                                                    is_callable_replace,
1372                                                                                    result_len,
1373                                                                                    limit,
1374                                                                                    replace_count TSRMLS_CC)) != NULL) {
1375                                 efree(subject_value);
1376                                 subject_value = result;
1377                                 subject_len = *result_len;
1378                         } else {
1379                                 efree(subject_value);
1380                                 return NULL;
1381                         }
1382 
1383                         zend_hash_move_forward(Z_ARRVAL_P(regex));
1384                 }
1385 
1386                 return subject_value;
1387         } else {
1388                 result = php_pcre_replace(Z_STRVAL_P(regex),
1389                                                                   Z_STRLEN_P(regex),
1390                                                                   Z_STRVAL_PP(subject),
1391                                                                   Z_STRLEN_PP(subject),
1392                                                                   replace,
1393                                                                   is_callable_replace,
1394                                                                   result_len,
1395                                                                   limit,
1396                                                                   replace_count TSRMLS_CC);
1397                 return result;
1398         }
1399 }
1400 /* }}} */
1401 
1402 /* {{{ preg_replace_impl
1403  */
1404 static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, int is_callable_replace, int is_filter)
1405 {
1406         zval               **regex,
1407                                    **replace,
1408                                    **subject,
1409                                    **subject_entry,
1410                                    **zcount = NULL;
1411         char                    *result;
1412         int                              result_len;
1413         int                              limit_val = -1;
1414         long                    limit = -1;
1415         char                    *string_key;
1416         uint                     string_key_len;
1417         ulong                    num_key;
1418         char                    *callback_name;
1419         int                              replace_count=0, old_replace_count;
1420 
1421         /* Get function parameters and do error-checking. */
1422         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZZ|lZ", &regex, &replace, &subject, &limit, &zcount) == FAILURE) {
1423                 return;
1424         }
1425 
1426         if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
1427                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1428                 RETURN_FALSE;
1429         }
1430 
1431         SEPARATE_ZVAL(replace);
1432         if (Z_TYPE_PP(replace) != IS_ARRAY && (Z_TYPE_PP(replace) != IS_OBJECT || !is_callable_replace)) {
1433                 convert_to_string_ex(replace);
1434         }
1435         if (is_callable_replace) {
1436                 if (!zend_is_callable(*replace, 0, &callback_name TSRMLS_CC)) {
1437                         php_error_docref(NULL TSRMLS_CC, E_WARNING, "Requires argument 2, '%s', to be a valid callback", callback_name);
1438                         efree(callback_name);
1439                         MAKE_COPY_ZVAL(subject, return_value);
1440                         return;
1441                 }
1442                 efree(callback_name);
1443         }
1444 
1445         SEPARATE_ZVAL(regex);
1446         SEPARATE_ZVAL(subject);
1447 
1448         if (ZEND_NUM_ARGS() > 3) {
1449                 limit_val = limit;
1450         }
1451 
1452         if (Z_TYPE_PP(regex) != IS_ARRAY)
1453                 convert_to_string_ex(regex);
1454 
1455         /* if subject is an array */
1456         if (Z_TYPE_PP(subject) == IS_ARRAY) {
1457                 array_init(return_value);
1458                 zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
1459 
1460                 /* For each subject entry, convert it to string, then perform replacement
1461                    and add the result to the return_value array. */
1462                 while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
1463                         SEPARATE_ZVAL(subject_entry);
1464                         old_replace_count = replace_count;
1465                         if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1466                                 if (!is_filter || replace_count > old_replace_count) {
1467                                         /* Add to return array */
1468                                         switch(zend_hash_get_current_key_ex(Z_ARRVAL_PP(subject), &string_key, &string_key_len, &num_key, 0, NULL))
1469                                         {
1470                                         case HASH_KEY_IS_STRING:
1471                                                 add_assoc_stringl_ex(return_value, string_key, string_key_len, result, result_len, 0);
1472                                                 break;
1473 
1474                                         case HASH_KEY_IS_LONG:
1475                                                 add_index_stringl(return_value, num_key, result, result_len, 0);
1476                                                 break;
1477                                         }
1478                                 } else {
1479                                         efree(result);
1480                                 }
1481                         }
1482 
1483                         zend_hash_move_forward(Z_ARRVAL_PP(subject));
1484                 }
1485         } else {        /* if subject is not an array */
1486                 old_replace_count = replace_count;
1487                 if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1488                         if (!is_filter || replace_count > old_replace_count) {
1489                                 RETVAL_STRINGL(result, result_len, 0);
1490                         } else {
1491                                 efree(result);
1492                         }
1493                 }
1494         }
1495         if (ZEND_NUM_ARGS() > 4) {
1496                 zval_dtor(*zcount);
1497                 ZVAL_LONG(*zcount, replace_count);
1498         }
1499 
1500 }
1501 /* }}} */
1502 
1503 /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1504    Perform Perl-style regular expression replacement. */
1505 static PHP_FUNCTION(preg_replace)
1506 {
             /* Plain variant: is_callable_replace = 0, is_filter = 0. */
1507         preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1508 }
1509 /* }}} */
1510 
1511 /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
1512    Perform Perl-style regular expression replacement using replacement callback. */
1513 static PHP_FUNCTION(preg_replace_callback)
1514 {
             /* Callback variant: is_callable_replace = 1, is_filter = 0. */
1515         preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1, 0);
1516 }
1517 /* }}} */
1518 
1519 /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1520    Perform Perl-style regular expression replacement and only return matches. */
1521 static PHP_FUNCTION(preg_filter)
1522 {
             /* Filter variant: is_callable_replace = 0, is_filter = 1, so subjects
                in which nothing was replaced are left out of the result. */
1523         preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1524 }
1525 /* }}} */
1526 
1527 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1528    Split string into an array using a perl-style regular expression as a delimiter */
1529 static PHP_FUNCTION(preg_split)
1530 {
1531         char                            *regex;                 /* Regular expression */
1532         char                            *subject;               /* String to match against */
1533         int                                      regex_len;
1534         int                                      subject_len;
1535         long                             limit_val = -1;/* Integer value of limit */
1536         long                             flags = 0;             /* Match control flags */
1537         pcre_cache_entry        *pce;                   /* Compiled regular expression */
1538 
1539         /* Get function parameters and do error checking */
1540         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", &regex, &regex_len,
1541                                                           &subject, &subject_len, &limit_val, &flags) == FAILURE) {
1542                 RETURN_FALSE;
1543         }
1544 
1545         /* Compile regex or get it from cache. */
1546         if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1547                 RETURN_FALSE;
1548         }
1549 
1550         pce->refcount++;
1551         php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC);
1552         pce->refcount--;
1553 }
1554 /* }}} */
1555 
1556 /* {{{ php_pcre_split
1557  */
1558 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
1559         long limit_val, long flags TSRMLS_DC)
1560 {
1561         pcre_extra              *extra = NULL;          /* Holds results of studying */
1562         pcre                    *re_bump = NULL;        /* Regex instance for empty matches */
1563         pcre_extra              *extra_bump = NULL;     /* Almost dummy */
1564         pcre_extra               extra_data;            /* Used locally for exec options */
1565         int                             *offsets;                       /* Array of subpattern offsets */
1566         int                              size_offsets;          /* Size of the offsets array */
1567         int                              exoptions = 0;         /* Execution options */
1568         int                              count = 0;                     /* Count of matched subpatterns */
1569         int                              start_offset;          /* Where the new search starts */
1570         int                              next_offset;           /* End of the last delimiter match + 1 */
1571         int                              g_notempty = 0;        /* If the match should not be empty */
1572         char                    *last_match;            /* Location of last match */
1573         int                              rc;
1574         int                              no_empty;                      /* If NO_EMPTY flag is set */
1575         int                              delim_capture;         /* If delimiters should be captured */
1576         int                              offset_capture;        /* If offsets should be captured */
1577 
1578         no_empty = flags & PREG_SPLIT_NO_EMPTY;
1579         delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1580         offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1581 
1582         if (limit_val == 0) {
1583                 limit_val = -1;
1584         }
1585 
1586         if (extra == NULL) {
1587                 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1588                 extra = &extra_data;
1589         }
1590         extra->match_limit = PCRE_G(backtrack_limit);
1591         extra->match_limit_recursion = PCRE_G(recursion_limit);
1592 #ifdef PCRE_EXTRA_MARK
1593         extra->flags &= ~PCRE_EXTRA_MARK;
1594 #endif
1595         
1596         /* Initialize return value */
1597         array_init(return_value);
1598 
1599         /* Calculate the size of the offsets array, and allocate memory for it. */
1600         rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1601         if (rc < 0) {
1602                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1603                 RETURN_FALSE;
1604         }
1605         size_offsets = (size_offsets + 1) * 3;
1606         offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1607 
1608         /* Start at the beginning of the string */
1609         start_offset = 0;
1610         next_offset = 0;
1611         last_match = subject;
1612         PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1613 
1614         /* Get next piece if no limit or limit not yet reached and something matched*/
1615         while ((limit_val == -1 || limit_val > 1)) {
1616                 count = pcre_exec(pce->re, extra, subject,
1617                                                   subject_len, start_offset,
1618                                                   exoptions|g_notempty, offsets, size_offsets);
1619 
1620                 /* the string was already proved to be valid UTF-8 */
1621                 exoptions |= PCRE_NO_UTF8_CHECK;
1622 
1623                 /* Check for too many substrings condition. */
1624                 if (count == 0) {
1625                         php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1626                         count = size_offsets/3;
1627                 }
1628 
1629                 /* If something matched */
1630                 if (count > 0 && (offsets[1] - offsets[0] >= 0)) {
1631                         if (!no_empty || &subject[offsets[0]] != last_match) {
1632 
1633                                 if (offset_capture) {
1634                                         /* Add (match, offset) pair to the return value */
1635                                         add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
1636                                 } else {
1637                                         /* Add the piece to the return value */
1638                                         add_next_index_stringl(return_value, last_match,
1639                                                                            &subject[offsets[0]]-last_match, 1);
1640                                 }
1641 
1642                                 /* One less left to do */
1643                                 if (limit_val != -1)
1644                                         limit_val--;
1645                         }
1646 
1647                         last_match = &subject[offsets[1]];
1648                         next_offset = offsets[1];
1649 
1650                         if (delim_capture) {
1651                                 int i, match_len;
1652                                 for (i = 1; i < count; i++) {
1653                                         match_len = offsets[(i<<1)+1] - offsets[i<<1];
1654                                         /* If we have matched a delimiter */
1655                                         if (!no_empty || match_len > 0) {
1656                                                 if (offset_capture) {
1657                                                         add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
1658                                                 } else {
1659                                                         add_next_index_stringl(return_value,
1660                                                                                                    &subject[offsets[i<<1]],
1661                                                                                                    match_len, 1);
1662                                                 }
1663                                         }
1664                                 }
1665                         }
1666                 } else if (count == PCRE_ERROR_NOMATCH) {
1667                         /* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
1668                            this is not necessarily the end. We need to advance
1669                            the start offset, and continue. Fudge the offset values
1670                            to achieve this, unless we're already at the end of the string. */
1671                         if (g_notempty != 0 && start_offset < subject_len) {
1672                                 if (pce->compile_options & PCRE_UTF8) {
1673                                         if (re_bump == NULL) {
1674                                                 int dummy;
1675 
1676                                                 if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump, &dummy TSRMLS_CC)) == NULL) {
1677                                                         RETURN_FALSE;
1678                                                 }
1679                                         }
1680                                         count = pcre_exec(re_bump, extra_bump, subject,
1681                                                           subject_len, start_offset,
1682                                                           exoptions, offsets, size_offsets);
1683                                         if (count < 1) {
1684                                                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error");
1685                                                 RETURN_FALSE;
1686                                         }
1687                                 } else {
1688                                         offsets[0] = start_offset;
1689                                         offsets[1] = start_offset + 1;
1690                                 }
1691                         } else
1692                                 break;
1693                 } else {
1694                         pcre_handle_exec_error(count TSRMLS_CC);
1695                         break;
1696                 }
1697 
1698                 /* If we have matched an empty string, mimic what Perl's /g options does.
1699                    This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
1700                    the match again at the same point. If this fails (picked up above) we
1701                    advance to the next character. */
1702                 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
1703                 
1704                 /* Advance to the position right after the last full match */
1705                 start_offset = offsets[1];
1706         }
1707 
1708 
1709         start_offset = last_match - subject; /* the offset might have been incremented, but without further successful matches */
1710 
1711         if (!no_empty || start_offset < subject_len)
1712         {
1713                 if (offset_capture) {
1714                         /* Add the last (match, offset) pair to the return value */
1715                         add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
1716                 } else {
1717                         /* Add the last piece to the return value */
1718                         add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
1719                 }
1720         }
1721 
1722 
1723         /* Clean up */
1724         efree(offsets);
1725 }
1726 /* }}} */
1727 
1728 /* {{{ proto string preg_quote(string str [, string delim_char])
1729    Quote regular expression characters plus an optional character */
1730 static PHP_FUNCTION(preg_quote)
1731 {
1732         int              in_str_len;
1733         char    *in_str;                /* Input string argument */
1734         char    *in_str_end;    /* End of the input string */
1735         int              delim_len = 0;
1736         char    *delim = NULL;  /* Additional delimiter argument */
1737         char    *out_str,               /* Output string with quoted characters */
1738                         *p,                             /* Iterator for input string */
1739                         *q,                             /* Iterator for output string */
1740                          delim_char=0,  /* Delimiter character to be quoted */
1741                          c;                             /* Current character */
1742         zend_bool quote_delim = 0; /* Whether to quote additional delim char */
1743 
1744         /* Get the arguments and check for errors */
1745         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str, &in_str_len,
1746                                                           &delim, &delim_len) == FAILURE) {
1747                 return;
1748         }
1749 
1750         in_str_end = in_str + in_str_len;
1751 
1752         /* Nothing to do if we got an empty string */
1753         if (in_str == in_str_end) {
1754                 RETURN_EMPTY_STRING();
1755         }
1756 
1757         if (delim && *delim) {
1758                 delim_char = delim[0];
1759                 quote_delim = 1;
1760         }
1761 
1762         /* Allocate enough memory so that even if each character
1763            is quoted, we won't run out of room */
1764         out_str = safe_emalloc(4, in_str_len, 1);
1765 
1766         /* Go through the string and quote necessary characters */
1767         for(p = in_str, q = out_str; p != in_str_end; p++) {
1768                 c = *p;
1769                 switch(c) {
1770                         case '.':
1771                         case '\\':
1772                         case '+':
1773                         case '*':
1774                         case '?':
1775                         case '[':
1776                         case '^':
1777                         case ']':
1778                         case '$':
1779                         case '(':
1780                         case ')':
1781                         case '{':
1782                         case '}':
1783                         case '=':
1784                         case '!':
1785                         case '>':
1786                         case '<':
1787                         case '|':
1788                         case ':':
1789                         case '-':
1790                                 *q++ = '\\';
1791                                 *q++ = c;
1792                                 break;
1793 
1794                         case '\0':
1795                                 *q++ = '\\';
1796                                 *q++ = '0';
1797                                 *q++ = '0';
1798                                 *q++ = '0';
1799                                 break;
1800 
1801                         default:
1802                                 if (quote_delim && c == delim_char)
1803                                         *q++ = '\\';
1804                                 *q++ = c;
1805                                 break;
1806                 }
1807         }
1808         *q = '\0';
1809 
1810         /* Reallocate string and return it */
1811         RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
1812 }
1813 /* }}} */
1814 
1815 /* {{{ proto array preg_grep(string regex, array input [, int flags])
1816    Searches array and returns entries which match regex */
1817 static PHP_FUNCTION(preg_grep)
1818 {
1819         char                            *regex;                 /* Regular expression */
1820         int                                      regex_len;
1821         zval                            *input;                 /* Input array */
1822         long                             flags = 0;             /* Match control flags */
1823         pcre_cache_entry        *pce;                   /* Compiled regular expression */
1824 
1825         /* Get arguments and do error checking */
1826         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sa|l", &regex, &regex_len,
1827                                                           &input, &flags) == FAILURE) {
1828                 return;
1829         }
1830 
1831         /* Compile regex or get it from cache. */
1832         if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1833                 RETURN_FALSE;
1834         }
1835 
1836         pce->refcount++;
1837         php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
1838         pce->refcount--;
1839 }
1840 /* }}} */
1841 
1842 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, long flags TSRMLS_DC) /* {{{ */
1843 {
1844         zval               **entry;                             /* An entry in the input array */
1845         pcre_extra              *extra = pce->extra;/* Holds results of studying */
1846         pcre_extra               extra_data;            /* Used locally for exec options */
1847         int                             *offsets;                       /* Array of subpattern offsets */
1848         int                              size_offsets;          /* Size of the offsets array */
1849         int                              count = 0;                     /* Count of matched subpatterns */
1850         char                    *string_key;
1851         uint                     string_key_len;
1852         ulong                    num_key;
1853         zend_bool                invert;                        /* Whether to return non-matching
1854                                                                                    entries */
1855         int                              rc;
1856 
1857         invert = flags & PREG_GREP_INVERT ? 1 : 0;
1858 
1859         if (extra == NULL) {
1860                 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1861                 extra = &extra_data;
1862         }
1863         extra->match_limit = PCRE_G(backtrack_limit);
1864         extra->match_limit_recursion = PCRE_G(recursion_limit);
1865 #ifdef PCRE_EXTRA_MARK
1866         extra->flags &= ~PCRE_EXTRA_MARK;
1867 #endif
1868 
1869         /* Calculate the size of the offsets array, and allocate memory for it. */
1870         rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1871         if (rc < 0) {
1872                 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1873                 RETURN_FALSE;
1874         }
1875         size_offsets = (size_offsets + 1) * 3;
1876         offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1877 
1878         /* Initialize return array */
1879         array_init(return_value);
1880 
1881         PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1882 
1883         /* Go through the input array */
1884         zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1885         while (zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
1886                 zval subject = **entry;
1887 
1888                 if (Z_TYPE_PP(entry) != IS_STRING) {
1889                         zval_copy_ctor(&subject);
1890                         convert_to_string(&subject);
1891                 }
1892 
1893                 /* Perform the match */
1894                 count = pcre_exec(pce->re, extra, Z_STRVAL(subject),
1895                                                   Z_STRLEN(subject), 0,
1896                                                   0, offsets, size_offsets);
1897 
1898                 /* Check for too many substrings condition. */
1899                 if (count == 0) {
1900                         php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
1901                         count = size_offsets/3;
1902                 } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1903                         pcre_handle_exec_error(count TSRMLS_CC);
1904                         break;
1905                 }
1906 
1907                 /* If the entry fits our requirements */
1908                 if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
1909 
1910                         Z_ADDREF_PP(entry);
1911 
1912                         /* Add to return array */
1913                         switch (zend_hash_get_current_key_ex(Z_ARRVAL_P(input), &string_key, &string_key_len, &num_key, 0, NULL))
1914                         {
1915                                 case HASH_KEY_IS_STRING:
1916                                         zend_hash_update(Z_ARRVAL_P(return_value), string_key,
1917                                                                          string_key_len, entry, sizeof(zval *), NULL);
1918                                         break;
1919 
1920                                 case HASH_KEY_IS_LONG:
1921                                         zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
1922                                                                                    sizeof(zval *), NULL);
1923                                         break;
1924                         }
1925                 }
1926 
1927                 if (Z_TYPE_PP(entry) != IS_STRING) {
1928                         zval_dtor(&subject);
1929                 }
1930 
1931                 zend_hash_move_forward(Z_ARRVAL_P(input));
1932         }
1933         zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1934         /* Clean up */
1935         efree(offsets);
1936 }
1937 /* }}} */
1938 
1939 /* {{{ proto int preg_last_error()
1940    Returns the error code of the last regexp execution. */
1941 static PHP_FUNCTION(preg_last_error)
1942 {
1943         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) {
1944                 return;
1945         }
1946 
1947         RETURN_LONG(PCRE_G(error_code));
1948 }
1949 /* }}} */
1950 
1951 /* {{{ module definition structures */
1952 
1953 /* {{{ arginfo */
/* Argument descriptions for the preg_* functions; each table is attached to
   its function in pcre_functions[].
   NOTE(review): the first argument of ZEND_ARG_INFO looks like a
   pass-by-reference flag (it is 1 only for the output parameters
   "subpatterns" and "count"), and the last argument of
   ZEND_BEGIN_ARG_INFO_EX looks like the number of required parameters
   (2 for preg_grep, whose proto lists two mandatory arguments) -- confirm
   against the Zend API headers. */

/* preg_match: pattern, subject, then optional subpatterns/flags/offset */
1954 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2)
1955     ZEND_ARG_INFO(0, pattern)
1956     ZEND_ARG_INFO(0, subject)
1957     ZEND_ARG_INFO(1, subpatterns) /* array */
1958     ZEND_ARG_INFO(0, flags)
1959     ZEND_ARG_INFO(0, offset)
1960 ZEND_END_ARG_INFO()
1961 
/* preg_match_all: same argument list as preg_match */
1962 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 2)
1963     ZEND_ARG_INFO(0, pattern)
1964     ZEND_ARG_INFO(0, subject)
1965     ZEND_ARG_INFO(1, subpatterns) /* array */
1966     ZEND_ARG_INFO(0, flags)
1967     ZEND_ARG_INFO(0, offset)
1968 ZEND_END_ARG_INFO()
1969 
/* preg_replace; also used for preg_filter in pcre_functions[] */
1970 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3)
1971     ZEND_ARG_INFO(0, regex)
1972     ZEND_ARG_INFO(0, replace)
1973     ZEND_ARG_INFO(0, subject)
1974     ZEND_ARG_INFO(0, limit)
1975     ZEND_ARG_INFO(1, count)
1976 ZEND_END_ARG_INFO()
1977 
/* preg_replace_callback: like preg_replace, with a callback as 2nd argument */
1978 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3)
1979     ZEND_ARG_INFO(0, regex)
1980     ZEND_ARG_INFO(0, callback)
1981     ZEND_ARG_INFO(0, subject)
1982     ZEND_ARG_INFO(0, limit)
1983     ZEND_ARG_INFO(1, count)
1984 ZEND_END_ARG_INFO()
1985 
/* preg_split */
1986 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2)
1987     ZEND_ARG_INFO(0, pattern)
1988     ZEND_ARG_INFO(0, subject)
1989     ZEND_ARG_INFO(0, limit)
1990     ZEND_ARG_INFO(0, flags)
1991 ZEND_END_ARG_INFO()
1992 
/* preg_quote */
1993 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1)
1994     ZEND_ARG_INFO(0, str)
1995     ZEND_ARG_INFO(0, delim_char)
1996 ZEND_END_ARG_INFO()
1997 
/* preg_grep */
1998 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2)
1999     ZEND_ARG_INFO(0, regex)
2000     ZEND_ARG_INFO(0, input) /* array */
2001     ZEND_ARG_INFO(0, flags)
2002 ZEND_END_ARG_INFO()
2003 
/* preg_last_error: no arguments are declared */
2004 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0)
2005 ZEND_END_ARG_INFO()
2006 /* }}} */
2007 
/* Function table of the extension, referenced from pcre_module_entry.
   Each PHP_FE() line pairs one preg_* function with its arginfo table;
   preg_filter reuses arginfo_preg_replace. PHP_FE_END closes the list. */
2008 static const zend_function_entry pcre_functions[] = {
2009         PHP_FE(preg_match,                              arginfo_preg_match)
2010         PHP_FE(preg_match_all,                  arginfo_preg_match_all)
2011         PHP_FE(preg_replace,                    arginfo_preg_replace)
2012         PHP_FE(preg_replace_callback,   arginfo_preg_replace_callback)
2013         PHP_FE(preg_filter,                             arginfo_preg_replace)
2014         PHP_FE(preg_split,                              arginfo_preg_split)
2015         PHP_FE(preg_quote,                              arginfo_preg_quote)
2016         PHP_FE(preg_grep,                               arginfo_preg_grep)
2017         PHP_FE(preg_last_error,                 arginfo_preg_last_error)
2018         PHP_FE_END
2019 };
2020 
/* Module descriptor for the "pcre" extension: wires up the function table,
   the module and globals init/shutdown hooks and the info callback.
   NOTE(review): the initializer is positional; the meaning of the NULL
   slots is inferred from their position (request startup/shutdown and a
   post-deactivate hook) -- confirm against the zend_module_entry
   definition. */
2021 zend_module_entry pcre_module_entry = {
2022         STANDARD_MODULE_HEADER,
2023    "pcre", /* extension name */
2024         pcre_functions, /* function table defined above */
2025         PHP_MINIT(pcre),
2026         PHP_MSHUTDOWN(pcre),
2027         NULL, /* presumably request startup: unused */
2028         NULL, /* presumably request shutdown: unused */
2029         PHP_MINFO(pcre),
2030         NO_VERSION_YET,
2031         PHP_MODULE_GLOBALS(pcre),
2032         PHP_GINIT(pcre),
2033         PHP_GSHUTDOWN(pcre),
2034         NULL, /* presumably post-deactivate hook: unused */
2035         STANDARD_MODULE_PROPERTIES_EX
2036 };
2037 
/* ZEND_GET_MODULE(pcre) is only emitted when COMPILE_DL_PCRE is defined */
2038 #ifdef COMPILE_DL_PCRE
2039 ZEND_GET_MODULE(pcre)
2040 #endif
2041 
2042 /* }}} */
2043 
2044 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
2045 
2046 /*
2047  * Local variables:
2048  * tab-width: 4
2049  * c-basic-offset: 4
2050  * End:
2051  * vim600: sw=4 ts=4 fdm=marker
2052  * vim<600: sw=4 ts=4
2053  */

/* [<][>][^][v][top][bottom][index][help] */