root/ext/intl/grapheme/grapheme_string.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. grapheme_register_constants
  2. PHP_FUNCTION
  3. PHP_FUNCTION
  4. PHP_FUNCTION
  5. PHP_FUNCTION
  6. PHP_FUNCTION
  7. PHP_FUNCTION
  8. strstr_common_handler
  9. PHP_FUNCTION
  10. PHP_FUNCTION
  11. grapheme_extract_charcount_iter
  12. grapheme_extract_bytecount_iter
  13. grapheme_extract_count_iter
  14. PHP_FUNCTION

   1 /*
   2    +----------------------------------------------------------------------+
   3    | PHP Version 5                                                                                                                |
   4    +----------------------------------------------------------------------+
   5    | This source file is subject to version 3.01 of the PHP license,      |
   6    | that is bundled with this package in the file LICENSE, and is                |
   7    | available through the world-wide-web at the following url:                   |
   8    | http://www.php.net/license/3_01.txt                                                                  |
   9    | If you did not receive a copy of the PHP license and are unable to   |
  10    | obtain it through the world-wide-web, please send a note to                  |
  11    | license@php.net so we can mail you a copy immediately.                               |
  12    +----------------------------------------------------------------------+
  13    | Author: Ed Batutis <ed@batutis.com>                                                                  |
  14    +----------------------------------------------------------------------+
  15  */
  16 
  17 /* {{{ includes */
  18 #ifdef HAVE_CONFIG_H
  19 #include "config.h"
  20 #endif
  21 
  22 #include <php.h>
  23 #include "grapheme.h"
  24 #include "grapheme_util.h"
  25 
  26 #include <unicode/utypes.h>
  27 #include <unicode/ucol.h>
  28 #include <unicode/ustring.h>
  29 #include <unicode/ubrk.h>
  30 
  31 #include "ext/standard/php_string.h"
  32 
  33 /* }}} */
  34 
  35 #define GRAPHEME_EXTRACT_TYPE_COUNT             0
  36 #define GRAPHEME_EXTRACT_TYPE_MAXBYTES  1
  37 #define GRAPHEME_EXTRACT_TYPE_MAXCHARS  2
  38 #define GRAPHEME_EXTRACT_TYPE_MIN       GRAPHEME_EXTRACT_TYPE_COUNT
  39 #define GRAPHEME_EXTRACT_TYPE_MAX       GRAPHEME_EXTRACT_TYPE_MAXCHARS
  40 
  41 
  42 /* {{{ grapheme_register_constants
  43  * Register API constants
  44  */
  45 void grapheme_register_constants( INIT_FUNC_ARGS )
  46 {
  47         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
  48         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
  49         REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
  50 }
  51 /* }}} */
  52 
  53 /* {{{ proto int grapheme_strlen(string str)
  54    Get number of graphemes in a string */
  55 PHP_FUNCTION(grapheme_strlen)
  56 {
  57         unsigned char* string;
  58         int string_len;
  59         UChar* ustring = NULL;
  60         int ustring_len = 0;
  61         int ret_len;
  62         UErrorCode status;
  63 
  64         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
  65 
  66                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  67                          "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
  68 
  69                 RETURN_FALSE;
  70         }
  71 
  72         ret_len = grapheme_ascii_check(string, string_len);
  73 
  74         if ( ret_len >= 0 )
  75                 RETURN_LONG(ret_len);
  76 
  77         /* convert the string to UTF-16. */
  78         status = U_ZERO_ERROR;
  79         intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
  80 
  81         if ( U_FAILURE( status ) ) {
  82                 /* Set global error code. */
  83                 intl_error_set_code( NULL, status TSRMLS_CC );
  84 
  85                 /* Set error messages. */
  86                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
  87                 if (ustring) {
  88                         efree( ustring );
  89                 }
  90                 RETURN_NULL();
  91         }
  92 
  93         ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
  94 
  95         if (ustring) {
  96                 efree( ustring );
  97         }
  98 
  99         if (ret_len >= 0) {
 100                 RETVAL_LONG(ret_len);
 101         } else {
 102                 RETVAL_FALSE;
 103         }
 104 }
 105 /* }}} */
 106 
 107 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
 108    Find position of first occurrence of a string within another */
 109 PHP_FUNCTION(grapheme_strpos)
 110 {
 111         unsigned char *haystack, *needle;
 112         int haystack_len, needle_len;
 113         unsigned char *found;
 114         long loffset = 0;
 115         int32_t offset = 0;
 116         int ret_pos;
 117 
 118         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
 119 
 120                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 121                          "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
 122 
 123                 RETURN_FALSE;
 124         }
 125 
 126         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
 127 
 128                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
 129 
 130                 RETURN_FALSE;
 131         }
 132 
 133         /* we checked that it will fit: */
 134         offset = (int32_t) loffset;
 135 
 136         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 137 
 138         if (needle_len == 0) {
 139 
 140                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
 141 
 142                 RETURN_FALSE;
 143         }
 144 
 145 
 146         /* quick check to see if the string might be there
 147          * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
 148         */
 149         found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
 150 
 151         /* if it isn't there the we are done */
 152         if (!found) {
 153                 RETURN_FALSE;
 154         }
 155 
 156         /* if it is there, and if the haystack is ascii, we are all done */
 157         if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
 158 
 159                 RETURN_LONG(found - haystack);
 160         }
 161 
 162         /* do utf16 part of the strpos */
 163         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ TSRMLS_CC );
 164 
 165         if ( ret_pos >= 0 ) {
 166                 RETURN_LONG(ret_pos);
 167         } else {
 168                 RETURN_FALSE;
 169         }
 170 
 171 }
 172 /* }}} */
 173 
 174 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
 175    Find position of first occurrence of a string within another, ignoring case differences */
 176 PHP_FUNCTION(grapheme_stripos)
 177 {
 178         unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
 179         int haystack_len, needle_len;
 180         unsigned char *found;
 181         long loffset = 0;
 182         int32_t offset = 0;
 183         int ret_pos;
 184         int is_ascii;
 185 
 186         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
 187 
 188                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 189                          "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
 190 
 191                 RETURN_FALSE;
 192         }
 193 
 194         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
 195 
 196                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
 197 
 198                 RETURN_FALSE;
 199         }
 200 
 201         /* we checked that it will fit: */
 202         offset = (int32_t) loffset;
 203 
 204         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 205 
 206         if (needle_len == 0) {
 207 
 208                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
 209 
 210                 RETURN_FALSE;
 211         }
 212 
 213 
 214         is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
 215 
 216         if ( is_ascii ) {
 217                 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
 218                 php_strtolower((char *)needle_dup, needle_len);
 219                 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
 220                 php_strtolower((char *)haystack_dup, haystack_len);
 221 
 222                 found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
 223 
 224                 efree(haystack_dup);
 225                 efree(needle_dup);
 226 
 227                 if (found) {
 228                         RETURN_LONG(found - haystack_dup);
 229                 }
 230 
 231                 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
 232                 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
 233                         RETURN_FALSE;
 234                 }
 235         }
 236 
 237         /* do utf16 part of the strpos */
 238         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ TSRMLS_CC );
 239 
 240         if ( ret_pos >= 0 ) {
 241                 RETURN_LONG(ret_pos);
 242         } else {
 243                 RETURN_FALSE;
 244         }
 245 
 246 }
 247 /* }}} */
 248 
 249 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
 250    Find position of last occurrence of a string within another */
 251 PHP_FUNCTION(grapheme_strrpos)
 252 {
 253         unsigned char *haystack, *needle;
 254         int haystack_len, needle_len;
 255         long loffset = 0;
 256         int32_t offset = 0;
 257         int32_t ret_pos;
 258         int is_ascii;
 259 
 260         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
 261 
 262                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 263                          "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
 264 
 265                 RETURN_FALSE;
 266         }
 267 
 268         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
 269 
 270                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
 271 
 272                 RETURN_FALSE;
 273         }
 274 
 275         /* we checked that it will fit: */
 276         offset = (int32_t) loffset;
 277 
 278         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 279 
 280         if (needle_len == 0) {
 281 
 282                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
 283 
 284                 RETURN_FALSE;
 285         }
 286 
 287         is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
 288 
 289         if ( is_ascii ) {
 290 
 291                 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
 292 
 293 
 294                 if ( ret_pos >= 0 ) {
 295                         RETURN_LONG(ret_pos);
 296                 }
 297 
 298                 /* if the needle was ascii too, we are done */
 299 
 300                 if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
 301                         RETURN_FALSE;
 302                 }
 303 
 304                 /* else we need to continue via utf16 */
 305         }
 306 
 307         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */ TSRMLS_CC);
 308 
 309         if ( ret_pos >= 0 ) {
 310                 RETURN_LONG(ret_pos);
 311         } else {
 312                 RETURN_FALSE;
 313         }
 314 
 315 
 316 }
 317 /* }}} */
 318 
 319 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
 320    Find position of last occurrence of a string within another, ignoring case */
 321 PHP_FUNCTION(grapheme_strripos)
 322 {
 323         unsigned char *haystack, *needle;
 324         int haystack_len, needle_len;
 325         long loffset = 0;
 326         int32_t offset = 0;
 327         int32_t ret_pos;
 328         int is_ascii;
 329 
 330         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
 331 
 332                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 333                          "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
 334 
 335                 RETURN_FALSE;
 336         }
 337 
 338         if ( OUTSIDE_STRING(loffset, haystack_len) ) {
 339 
 340                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
 341 
 342                 RETURN_FALSE;
 343         }
 344 
 345         /* we checked that it will fit: */
 346         offset = (int32_t) loffset;
 347 
 348         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 349 
 350         if (needle_len == 0) {
 351 
 352                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
 353 
 354                 RETURN_FALSE;
 355         }
 356 
 357         is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
 358 
 359         if ( is_ascii ) {
 360                 unsigned char *needle_dup, *haystack_dup;
 361 
 362                 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
 363                 php_strtolower((char *)needle_dup, needle_len);
 364                 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
 365                 php_strtolower((char *)haystack_dup, haystack_len);
 366 
 367                 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
 368 
 369                 efree(haystack_dup);
 370                 efree(needle_dup);
 371 
 372                 if ( ret_pos >= 0 ) {
 373                         RETURN_LONG(ret_pos);
 374                 }
 375 
 376                 /* if the needle was ascii too, we are done */
 377 
 378                 if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
 379                         RETURN_FALSE;
 380                 }
 381 
 382                 /* else we need to continue via utf16 */
 383         }
 384 
 385         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */ TSRMLS_CC);
 386 
 387         if ( ret_pos >= 0 ) {
 388                 RETURN_LONG(ret_pos);
 389         } else {
 390                 RETURN_FALSE;
 391         }
 392 
 393 
 394 }
 395 /* }}} */
 396 
 397 /* {{{ proto string grapheme_substr(string str, int start [, int length])
 398    Returns part of a string */
 399 PHP_FUNCTION(grapheme_substr)
 400 {
 401         unsigned char *str, *sub_str;
 402         UChar *ustr;
 403         int str_len, sub_str_len, ustr_len;
 404         long lstart = 0, length = 0;
 405         int32_t start = 0;
 406         int iter_val;
 407         UErrorCode status;
 408         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
 409         UBreakIterator* bi = NULL;
 410         int sub_str_start_pos, sub_str_end_pos;
 411         int32_t (*iter_func)(UBreakIterator *);
 412 
 413         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
 414 
 415                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 416                          "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
 417 
 418                 RETURN_FALSE;
 419         }
 420 
 421         if ( OUTSIDE_STRING(lstart, str_len) ) {
 422 
 423                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
 424 
 425                 RETURN_FALSE;
 426         }
 427 
 428         /* we checked that it will fit: */
 429         start = (int32_t) lstart;
 430 
 431         /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
 432 
 433         if ( grapheme_ascii_check(str, str_len) >= 0 ) {
 434                 grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
 435 
 436                 if ( NULL == sub_str ) {
 437                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 TSRMLS_CC );
 438                         RETURN_FALSE;
 439                 }
 440 
 441                 RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
 442         }
 443 
 444         ustr = NULL;
 445         ustr_len = 0;
 446         status = U_ZERO_ERROR;
 447         intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
 448 
 449         if ( U_FAILURE( status ) ) {
 450                 /* Set global error code. */
 451                 intl_error_set_code( NULL, status TSRMLS_CC );
 452 
 453                 /* Set error messages. */
 454                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
 455                 if (ustr) {
 456                         efree( ustr );
 457                 }
 458                 RETURN_FALSE;
 459         }
 460 
 461         bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
 462 
 463         if( U_FAILURE(status) ) {
 464                 RETURN_FALSE;
 465         }
 466 
 467         ubrk_setText(bi, ustr, ustr_len,        &status);
 468 
 469         if ( start < 0 ) {
 470                 iter_func = ubrk_previous;
 471                 ubrk_last(bi);
 472                 iter_val = 1;
 473         }
 474         else {
 475                 iter_func = ubrk_next;
 476                 iter_val = -1;
 477         }
 478 
 479         sub_str_start_pos = 0;
 480 
 481         while ( start ) {
 482                 sub_str_start_pos = iter_func(bi);
 483 
 484                 if ( UBRK_DONE == sub_str_start_pos ) {
 485                         break;
 486                 }
 487 
 488                 start += iter_val;
 489         }
 490 
 491         if ( 0 != start || sub_str_start_pos >= ustr_len ) {
 492 
 493                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
 494 
 495                 if (ustr) {
 496                         efree(ustr);
 497                 }
 498                 ubrk_close(bi);
 499                 RETURN_FALSE;
 500         }
 501 
 502         if (ZEND_NUM_ARGS() <= 2) {
 503 
 504                 /* no length supplied, return the rest of the string */
 505 
 506                 sub_str = NULL;
 507                 sub_str_len = 0;
 508                 status = U_ZERO_ERROR;
 509                 intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
 510 
 511                 if (ustr) {
 512                         efree( ustr );
 513                 }
 514                 ubrk_close( bi );
 515 
 516                 if ( U_FAILURE( status ) ) {
 517                         /* Set global error code. */
 518                         intl_error_set_code( NULL, status TSRMLS_CC );
 519 
 520                         /* Set error messages. */
 521                         intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
 522 
 523                         if (sub_str) {
 524                                 efree( sub_str );
 525                         }
 526 
 527                         RETURN_FALSE;
 528                 }
 529 
 530                 /* return the allocated string, not a duplicate */
 531                 RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
 532         }
 533 
 534         if(length == 0) {
 535                 /* empty length - we've validated start, we can return "" now */
 536                 if (ustr) {
 537                         efree(ustr);
 538                 }
 539                 ubrk_close(bi);
 540                 RETURN_EMPTY_STRING();          
 541         }
 542 
 543         /* find the end point of the string to return */
 544 
 545         if ( length < 0 ) {
 546                 iter_func = ubrk_previous;
 547                 ubrk_last(bi);
 548                 iter_val = 1;
 549         }
 550         else {
 551                 iter_func = ubrk_next;
 552                 iter_val = -1;
 553         }
 554 
 555         sub_str_end_pos = 0;
 556 
 557         while ( length ) {
 558                 sub_str_end_pos = iter_func(bi);
 559 
 560                 if ( UBRK_DONE == sub_str_end_pos ) {
 561                         break;
 562                 }
 563 
 564                 length += iter_val;
 565         }
 566 
 567         ubrk_close(bi);
 568 
 569         if ( UBRK_DONE == sub_str_end_pos) {
 570                 if(length < 0) {
 571                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
 572 
 573                         efree(ustr);
 574                         RETURN_FALSE;
 575                 } else {
 576                         sub_str_end_pos = ustr_len;
 577                 }
 578         }
 579         
 580         if(sub_str_start_pos > sub_str_end_pos) {
 581                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 TSRMLS_CC );
 582 
 583                 efree(ustr);
 584                 RETURN_FALSE;
 585         }
 586 
 587         sub_str = NULL;
 588         status = U_ZERO_ERROR;
 589         intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
 590 
 591         efree( ustr );
 592 
 593         if ( U_FAILURE( status ) ) {
 594                 /* Set global error code. */
 595                 intl_error_set_code( NULL, status TSRMLS_CC );
 596 
 597                 /* Set error messages. */
 598                 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
 599 
 600                 if ( NULL != sub_str )
 601                         efree( sub_str );
 602 
 603                 RETURN_FALSE;
 604         }
 605 
 606          /* return the allocated string, not a duplicate */
 607         RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
 608 
 609 }
 610 /* }}} */
 611 
 612 /* {{{  strstr_common_handler */
 613 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
 614 {
 615         unsigned char *haystack, *needle, *found;
 616         int haystack_len, needle_len;
 617         int ret_pos, uchar_pos;
 618         zend_bool part = 0;
 619 
 620         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
 621 
 622                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 623                          "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
 624 
 625                 RETURN_FALSE;
 626         }
 627 
 628         if (needle_len == 0) {
 629 
 630                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
 631 
 632                 RETURN_FALSE;
 633         }
 634 
 635 
 636         if ( !f_ignore_case ) {
 637 
 638                 /* ASCII optimization: quick check to see if the string might be there
 639                  * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
 640                 */
 641                 found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
 642 
 643                 /* if it isn't there the we are done */
 644                 if ( !found ) {
 645                         RETURN_FALSE;
 646                 }
 647 
 648                 /* if it is there, and if the haystack is ascii, we are all done */
 649                 if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
 650                         size_t found_offset = found - haystack;
 651 
 652                         if (part) {
 653                                 RETURN_STRINGL(((char *)haystack) , found_offset, 1);
 654                         } else {
 655                                 RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
 656                         }
 657                 }
 658 
 659         }
 660 
 661         /* need to work in utf16 */
 662         ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ TSRMLS_CC );
 663 
 664         if ( ret_pos < 0 ) {
 665                 RETURN_FALSE;
 666         }
 667 
 668         /* uchar_pos is the 'nth' Unicode character position of the needle */
 669 
 670         ret_pos = 0;
 671         U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
 672 
 673         if (part) {
 674                 RETURN_STRINGL(((char *)haystack), ret_pos, 1);
 675         }
 676         else {
 677                 RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
 678         }
 679 
 680 }
 681 /* }}} */
 682 
 683 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
 684    Finds first occurrence of a string within another */
 685 PHP_FUNCTION(grapheme_strstr)
 686 {
 687         strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
 688 }
 689 /* }}} */
 690 
 691 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
 692    Finds first occurrence of a string within another */
 693 PHP_FUNCTION(grapheme_stristr)
 694 {
 695         strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
 696 }
 697 /* }}} */
 698 
 699 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
 700 static inline int32_t
 701 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
 702 {
 703         int pos = 0, prev_pos = 0;
 704         int ret_pos = 0, prev_ret_pos = 0;
 705 
 706         while ( 1 ) {
 707                 pos = ubrk_next(bi);
 708 
 709                 if ( UBRK_DONE == pos ) {
 710                         break;
 711                 }
 712 
 713                 /* if we are beyond our limit, then the loop is done */
 714                 if ( pos > csize ) {
 715                         break;
 716                 }
 717 
 718                 /* update our pointer in the original UTF-8 buffer by as many characters
 719                    as ubrk_next iterated over */
 720 
 721                 prev_ret_pos = ret_pos;
 722                 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
 723 
 724                 if ( prev_ret_pos == ret_pos ) {
 725                         /* something wrong - malformed utf8? */
 726                         break;
 727                 }
 728 
 729                 prev_pos = pos;
 730         }
 731 
 732         return ret_pos;
 733 }
 734 /* }}} */
 735 
 736 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
 737 static inline int32_t
 738 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
 739 {
 740         int pos = 0, prev_pos = 0;
 741         int ret_pos = 0, prev_ret_pos = 0;
 742 
 743         while ( 1 ) {
 744                 pos = ubrk_next(bi);
 745 
 746                 if ( UBRK_DONE == pos ) {
 747                         break;
 748                 }
 749 
 750                 prev_ret_pos = ret_pos;
 751                 U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
 752 
 753                 if ( ret_pos > bsize ) {
 754                         ret_pos = prev_ret_pos;
 755                         break;
 756                 }
 757 
 758                 if ( prev_ret_pos == ret_pos ) {
 759                         /* something wrong - malformed utf8? */
 760                         break;
 761                 }
 762 
 763                 prev_pos = pos;
 764         }
 765 
 766         return ret_pos;
 767 }
 768 /* }}} */
 769 
 770 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
 771 static inline int32_t
 772 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
 773 {
 774         int pos = 0, next_pos = 0;
 775         int ret_pos = 0;
 776 
 777         while ( size ) {
 778                 next_pos = ubrk_next(bi);
 779 
 780                 if ( UBRK_DONE == next_pos ) {
 781                         break;
 782                 }
 783                 pos = next_pos;
 784                 size--;
 785         }
 786 
 787         /* pos is one past the last UChar - and represent the number of code units to
 788                 advance in the utf-8 buffer
 789         */
 790 
 791         U8_FWD_N(pstr, ret_pos, str_len, pos);
 792 
 793         return ret_pos;
 794 }
 795 /* }}} */
 796 
 797 /* {{{ grapheme extract iter function pointer array */
 798 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
 799 
 800 static grapheme_extract_iter grapheme_extract_iters[] = {
 801         &grapheme_extract_count_iter,
 802         &grapheme_extract_bytecount_iter,
 803         &grapheme_extract_charcount_iter,
 804 };
 805 /* }}} */
 806 
 807 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
 808         Function to extract a sequence of default grapheme clusters */
 809 PHP_FUNCTION(grapheme_extract)
 810 {
 811         unsigned char *str, *pstr;
 812         UChar *ustr;
 813         int str_len, ustr_len;
 814         long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
 815         long lstart = 0; /* starting position in str in bytes */
 816         int32_t start = 0;
 817         long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
 818         UErrorCode status;
 819         unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
 820         UBreakIterator* bi = NULL;
 821         int ret_pos;
 822         zval *next = NULL; /* return offset of next part of the string */
 823 
 824         if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
 825 
 826                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 827                          "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
 828 
 829                 RETURN_FALSE;
 830         }
 831 
 832         if ( NULL != next ) {
 833                 if ( !PZVAL_IS_REF(next) ) {
 834                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 835                                  "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
 836 
 837                         RETURN_FALSE;
 838                 }
 839                 else {
 840                         /* initialize next */
 841                         zval_dtor(next);
 842             ZVAL_LONG(next, lstart);
 843                 }
 844         }
 845 
 846         if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
 847 
 848                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 849                          "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
 850 
 851                 RETURN_FALSE;
 852         }
 853 
 854         if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
 855                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
 856                 RETURN_FALSE;
 857         }
 858 
 859         if ( size > INT32_MAX || size < 0) {
 860                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
 861                 RETURN_FALSE;
 862         }
 863         if (size == 0) {
 864                 RETURN_EMPTY_STRING();
 865         }
 866 
 867         /* we checked that it will fit: */
 868         start = (int32_t) lstart;
 869 
 870         pstr = str + start;
 871 
 872         /* just in case pstr points in the middle of a character, move forward to the start of the next char */
 873         if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
 874                 unsigned char *str_end = str + str_len;
 875 
 876                 while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
 877                         pstr++;
 878                         if ( pstr >= str_end ) {
 879                                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 880                                                                 "grapheme_extract: invalid input string", 0 TSRMLS_CC );
 881 
 882                                 RETURN_FALSE;
 883                         }
 884                 }
 885         }
 886 
 887         str_len -= (pstr - str);
 888 
 889         /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
 890                 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
 891          */
 892 
 893         if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
 894         long nsize = ( size < str_len ? size : str_len );
 895                 if ( NULL != next ) {
 896                         ZVAL_LONG(next, start+nsize);
 897                 }
 898                 RETURN_STRINGL(((char *)pstr), nsize, 1);
 899         }
 900 
 901         /* convert the strings to UTF-16. */
 902         ustr = NULL;
 903         ustr_len = 0;
 904         status = U_ZERO_ERROR;
 905         intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
 906 
 907         if ( U_FAILURE( status ) ) {
 908                 /* Set global error code. */
 909                 intl_error_set_code( NULL, status TSRMLS_CC );
 910 
 911                 /* Set error messages. */
 912                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
 913 
 914                 if ( NULL != ustr )
 915                         efree( ustr );
 916 
 917                 RETURN_FALSE;
 918         }
 919 
 920         bi = NULL;
 921         status = U_ZERO_ERROR;
 922         bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
 923 
 924         ubrk_setText(bi, ustr, ustr_len, &status);
 925 
 926         /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
 927                 can't back up. So, we will not do anything. */
 928 
 929         /* now we need to find the end of the chunk the user wants us to return */
 930 
 931         ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
 932 
 933         if (ustr) {
 934                 efree(ustr);
 935         }
 936         ubrk_close(bi);
 937 
 938         if ( NULL != next ) {
 939                 ZVAL_LONG(next, start+ret_pos);
 940         }
 941 
 942         RETURN_STRINGL(((char *)pstr), ret_pos, 1);
 943 }
 944 
 945 /* }}} */
 946 
 947 /*
 948  * Local variables:
 949  * tab-width: 4
 950  * c-basic-offset: 4
 951  * End:
 952  * vim600: fdm=marker
 953  * vim: noet sw=4 ts=4
 954  */
 955 

/* [<][>][^][v][top][bottom][index][help] */