root/ext/intl/normalizer/normalizer_normalize.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. PHP_FUNCTION
  2. PHP_FUNCTION

   1 /*
   2    +----------------------------------------------------------------------+
   3    | PHP Version 5                                                                                                                |
   4    +----------------------------------------------------------------------+
   5    | This source file is subject to version 3.01 of the PHP license,      |
   6    | that is bundled with this package in the file LICENSE, and is                |
   7    | available through the world-wide-web at the following url:                   |
   8    | http://www.php.net/license/3_01.txt                                                                  |
   9    | If you did not receive a copy of the PHP license and are unable to   |
  10    | obtain it through the world-wide-web, please send a note to                  |
  11    | license@php.net so we can mail you a copy immediately.                               |
  12    +----------------------------------------------------------------------+
  13    | Authors: Ed Batutis <ed@batutis.com>                                                                 |
  14    +----------------------------------------------------------------------+
  15  */
  16 
  17 #ifdef HAVE_CONFIG_H
  18 #include "config.h"
  19 #endif
  20 
  21 #include "php_intl.h"
  22 #include "unicode/unorm.h"
  23 #include "normalizer.h"
  24 #include "normalizer_class.h"
  25 #include "normalizer_normalize.h"
  26 #include "intl_convert.h"
  27 
  28 /* {{{ proto string Normalizer::normalize( string $input [, string $form = FORM_C] )
  29  * Normalize a string. }}} */
  30 /* {{{ proto string normalizer_normalize( string $input [, string $form = FORM_C] )
  31  * Normalize a string.
  32  */
  33 PHP_FUNCTION( normalizer_normalize )
  34 {
  35         char*                   input = NULL;
  36         /* form is optional, defaults to FORM_C */
  37         long                    form = NORMALIZER_DEFAULT;
  38         int                     input_len = 0;
  39                 
  40         UChar*                  uinput = NULL;
  41         int                     uinput_len = 0;
  42         int                     expansion_factor = 1;
  43         UErrorCode              status = U_ZERO_ERROR;
  44                 
  45         UChar*                  uret_buf = NULL;
  46         int                     uret_len = 0;
  47                 
  48         char*                   ret_buf = NULL;
  49         int32_t                 ret_len = 0;
  50 
  51         int32_t                 size_needed;
  52                 
  53         intl_error_reset( NULL TSRMLS_CC );
  54 
  55         /* Parse parameters. */
  56         if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
  57                                 &input, &input_len, &form ) == FAILURE )
  58         {
  59                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  60                                                  "normalizer_normalize: unable to parse input params", 0 TSRMLS_CC );
  61 
  62                 RETURN_FALSE;
  63         }
  64 
  65         expansion_factor = 1;
  66 
  67         switch(form) {
  68                 case NORMALIZER_NONE:
  69                         break;
  70                 case NORMALIZER_FORM_D:
  71                         expansion_factor = 3;
  72                         break;
  73                 case NORMALIZER_FORM_KD:
  74                         expansion_factor = 3;
  75                         break;
  76                 case NORMALIZER_FORM_C:
  77                 case NORMALIZER_FORM_KC:
  78                         break;
  79                 default:
  80                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
  81                                                 "normalizer_normalize: illegal normalization form", 0 TSRMLS_CC );
  82                         RETURN_FALSE;
  83         }
  84 
  85         /*
  86          * Normalize string (converting it to UTF-16 first).
  87          */
  88 
  89         /* First convert the string to UTF-16. */
  90         intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
  91 
  92         if( U_FAILURE( status ) )
  93         {
  94                 /* Set global error code. */
  95                 intl_error_set_code( NULL, status TSRMLS_CC );
  96 
  97                 /* Set error messages. */
  98                 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
  99                 if (uinput) {
 100                         efree( uinput );
 101                 }
 102                 RETURN_FALSE;
 103         }
 104 
 105 
 106         /* Allocate memory for the destination buffer for normalization */
 107         uret_len = uinput_len * expansion_factor;
 108         uret_buf = eumalloc( uret_len + 1 );
 109 
 110         /* normalize */
 111         size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
 112         
 113         /* Bail out if an unexpected error occurred.
 114          * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
 115          * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
 116          */     
 117         if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
 118                 efree( uret_buf );
 119                 efree( uinput );
 120                 RETURN_NULL();
 121         }
 122 
 123         if ( size_needed > uret_len ) {
 124                 /* realloc does not seem to work properly - memory is corrupted
 125                  * uret_buf =  eurealloc(uret_buf, size_needed + 1);
 126                  */
 127                 efree( uret_buf );
 128                 uret_buf = eumalloc( size_needed + 1 );
 129                 uret_len = size_needed;
 130 
 131                 status = U_ZERO_ERROR;
 132 
 133                 /* try normalize again */
 134                 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
 135 
 136                 /* Bail out if an unexpected error occurred. */
 137                 if( U_FAILURE(status)  ) {
 138                         /* Set error messages. */
 139                         intl_error_set_custom_msg( NULL,"Error normalizing string", 0 TSRMLS_CC );
 140                         efree( uret_buf );
 141                         efree( uinput );
 142                         RETURN_FALSE;
 143                 }
 144         }
 145 
 146         efree( uinput );
 147 
 148         /* the buffer we actually used */
 149         uret_len = size_needed;
 150 
 151         /* Convert normalized string from UTF-16 to UTF-8. */
 152         intl_convert_utf16_to_utf8( &ret_buf, &ret_len, uret_buf, uret_len, &status );
 153         efree( uret_buf );
 154         if( U_FAILURE( status ) )
 155         {
 156                 intl_error_set( NULL, status,
 157                                 "normalizer_normalize: error converting normalized text UTF-8", 0 TSRMLS_CC );
 158                 RETURN_FALSE;
 159         }
 160 
 161         /* Return it. */
 162         RETVAL_STRINGL( ret_buf, ret_len, FALSE );
 163 }
 164 /* }}} */
 165 
 166 /* {{{ proto bool Normalizer::isNormalized( string $input [, string $form = FORM_C] )
 167  * Test if a string is in a given normalization form. }}} */
 168 /* {{{ proto bool normalizer_is_normalize( string $input [, string $form = FORM_C] )
 169  * Test if a string is in a given normalization form.
 170  */
 171 PHP_FUNCTION( normalizer_is_normalized )
 172 {
 173         char*           input = NULL;
 174         /* form is optional, defaults to FORM_C */
 175         long            form = NORMALIZER_DEFAULT;
 176         int             input_len = 0;
 177 
 178         UChar*          uinput = NULL;
 179         int             uinput_len = 0;
 180         UErrorCode      status = U_ZERO_ERROR;
 181                 
 182         UBool           uret = FALSE;
 183                 
 184         intl_error_reset( NULL TSRMLS_CC );
 185 
 186         /* Parse parameters. */
 187         if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
 188                                 &input, &input_len, &form) == FAILURE )
 189         {
 190                 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 191                                 "normalizer_is_normalized: unable to parse input params", 0 TSRMLS_CC );
 192 
 193                 RETURN_FALSE;
 194         }
 195 
 196         switch(form) {
 197                 /* case NORMALIZER_NONE: not allowed - doesn't make sense */
 198 
 199                 case NORMALIZER_FORM_D:
 200                 case NORMALIZER_FORM_KD:
 201                 case NORMALIZER_FORM_C:
 202                 case NORMALIZER_FORM_KC:
 203                         break;
 204                 default:
 205                         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
 206                                                 "normalizer_normalize: illegal normalization form", 0 TSRMLS_CC );
 207                         RETURN_FALSE;
 208         }
 209 
 210 
 211         /*
 212          * Test normalization of string (converting it to UTF-16 first).
 213          */
 214 
 215         /* First convert the string to UTF-16. */
 216         intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
 217 
 218         if( U_FAILURE( status ) )
 219         {
 220                 /* Set global error code. */
 221                 intl_error_set_code( NULL, status TSRMLS_CC );
 222 
 223                 /* Set error messages. */
 224                 intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 TSRMLS_CC );
 225                 if (uinput) {
 226                         efree( uinput );
 227                 }
 228                 RETURN_FALSE;
 229         }
 230 
 231 
 232         /* test string */
 233         uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
 234         
 235         efree( uinput );
 236 
 237         /* Bail out if an unexpected error occurred. */
 238         if( U_FAILURE(status)  ) {
 239                 /* Set error messages. */
 240                 intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 TSRMLS_CC );
 241                 RETURN_FALSE;
 242         }
 243 
 244         if ( uret )
 245                 RETURN_TRUE;
 246                                 
 247         RETURN_FALSE;
 248 }
 249 /* }}} */
 250 
 251 /*
 252  * Local variables:
 253  * tab-width: 4
 254  * c-basic-offset: 4
 255  * End:
 256  * vim600: noet sw=4 ts=4 fdm=marker
 257  * vim<600: noet sw=4 ts=4
 258  */

/* [<][>][^][v][top][bottom][index][help] */