<?php
# Copyright (C) 2004 Brion Vibber <brion@pobox.com>
# http://www.mediawiki.org/
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
# http://www.gnu.org/copyleft/gpl.html

/**
 * Command-line PHPUnit suite for UtfNormal::cleanUp().
 *
 * The script refuses to run outside the CLI SAPI, builds a suite from
 * CleanUpTest, runs it with the text UI runner, and exits with status 0 when
 * every test passed or -1 otherwise.
 */

if( php_sapi_name() !== 'cli' ) {
    die( "Run me from the command line please.\n" );
}

# Passing --icu on the command line loads the php_utfnormal extension before
# UtfNormal.php is included (presumably so the extension-backed code path is
# the one under test -- confirm against UtfNormal.php).
if( isset( $_SERVER['argv'] ) && in_array( '--icu', $_SERVER['argv'], true ) ) {
    dl( 'php_utfnormal.so' );
}

#ini_set( 'memory_limit', '40M' );

require_once 'PHPUnit/Framework.php';
require_once 'PHPUnit/TextUI/TestRunner.php';

require_once 'UtfNormal.php';

/**
 * Checks that UtfNormal::cleanUp() leaves acceptable input alone (or returns
 * what UtfNormal::NFC() returns for it) and substitutes the replacement
 * character for every byte sequence the tests treat as invalid.
 *
 * Most comparisons go through bin2hex() so that a failure message shows
 * readable hex instead of raw binary.
 */
class CleanUpTest extends PHPUnit_Framework_TestCase {
    /** No fixture is needed; present only as the PHPUnit hook. */
    function setUp() {
    }

    /** Nothing to release; present only as the PHPUnit hook. */
    function tearDown() {
    }

    /** Plain ASCII text must come back unchanged. */
    function testAscii() {
        $text = 'This is plain ASCII text.';
        $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
    }

    /** A NUL byte must be replaced by the bytes EF BF BD (UTF-8 for U+FFFD). */
    function testNull() {
        $text = "a \x00 null";
        $expect = "a \xef\xbf\xbd null";
        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /** Text containing a precomposed e-acute (C3 A9) must come back unchanged. */
    function testLatin() {
        $text = "L'\xc3\xa9cole";
        $this->assertEquals( $text, UtfNormal::cleanUp( $text ) );
    }

    /** "e" followed by a combining acute (CC 81) must become the precomposed C3 A9. */
    function testLatinNormal() {
        $text = "L'e\xcc\x81cole";
        $expect = "L'\xc3\xa9cole";
        $this->assertEquals( $expect, UtfNormal::cleanUp( $text ) );
    }

    /**
     * Walks every code point below UNICODE_MAX and checks the clean-up result.
     *
     * The "X" prefix means the name does not start with "test", so PHPUnit does
     * not collect it; rename it to testAllChars to run it.
     *
     * Tab, LF, CR and the code points accepted by the range check below must
     * either survive unchanged or, when they appear as a key in one of the
     * canonical (de)composition tables, equal the output of UtfNormal::NFC().
     * Everything else must turn into UTF8_REPLACEMENT.
     */
    function XtestAllChars() {
        $rep = UTF8_REPLACEMENT;
        global $utfCanonicalComp, $utfCanonicalDecomp;
        # NOTE(review): the loop stops before UNICODE_MAX although the range
        # check below uses "<= UNICODE_MAX"; the last code point is therefore
        # never exercised. Confirm whether that is intended.
        for( $i = 0x0; $i < UNICODE_MAX; $i++ ) {
            $char = codepointToUtf8( $i );
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%04X", $i );
            # Progress marker every 0x1000 code points.
            if( $i % 0x1000 == 0 ) echo "U+$x\n";
            if( $i == 0x0009 ||
                $i == 0x000a ||
                $i == 0x000d ||
                ($i > 0x001f && $i < UNICODE_SURROGATE_FIRST) ||
                ($i > UNICODE_SURROGATE_LAST && $i < 0xfffe ) ||
                ($i > 0xffff && $i <= UNICODE_MAX ) ) {
                if( isset( $utfCanonicalComp[$char] ) || isset( $utfCanonicalDecomp[$char] ) ) {
                    $comp = UtfNormal::NFC( $char );
                    $this->assertEquals(
                        bin2hex( $comp ),
                        bin2hex( $clean ),
                        "U+$x should be decomposed" );
                } else {
                    $this->assertEquals(
                        bin2hex( $char ),
                        bin2hex( $clean ),
                        "U+$x should be intact" );
                }
            } else {
                $this->assertEquals( bin2hex( $rep ), bin2hex( $clean ), $x );
            }
        }
    }

    /** Runs the single-byte check bare and with an "x" before and/or after. */
    function testAllBytes() {
        $this->doTestBytes( '', '' );
        $this->doTestBytes( 'x', '' );
        $this->doTestBytes( '', 'x' );
        $this->doTestBytes( 'x', 'x' );
    }

    /**
     * Feeds each of the 256 byte values, wrapped in $head and $tail, through
     * cleanUp(). Tab, LF, CR and bytes 0x20-0x7F must survive; every other byte
     * must be replaced by UTF8_REPLACEMENT.
     *
     * @param string $head Text placed before the byte under test
     * @param string $tail Text placed after the byte under test
     */
    function doTestBytes( $head, $tail ) {
        for( $i = 0x0; $i < 256; $i++ ) {
            $char = $head . chr( $i ) . $tail;
            $clean = UtfNormal::cleanUp( $char );
            $x = sprintf( "%02X", $i );
            if( $i == 0x0009 ||
                $i == 0x000a ||
                $i == 0x000d ||
                ($i > 0x001f && $i < 0x80) ) {
                $this->assertEquals(
                    bin2hex( $char ),
                    bin2hex( $clean ),
                    "ASCII byte $x should be intact" );
                # Stop at the first mismatch; strict comparison keeps PHP from
                # treating numeric-looking byte strings as numbers.
                if( $char !== $clean ) return;
            } else {
                $norm = $head . UTF8_REPLACEMENT . $tail;
                $this->assertEquals(
                    bin2hex( $norm ),
                    bin2hex( $clean ),
                    "Forbidden byte $x should be rejected" );
                if( $norm !== $clean ) return;
            }
        }
    }

    /** Runs the two-byte check bare and with an "x" before and/or after. */
    function testDoubleBytes() {
        $this->doTestDoubleBytes( '', '' );
        $this->doTestDoubleBytes( 'x', '' );
        $this->doTestDoubleBytes( '', 'x' );
        $this->doTestDoubleBytes( 'x', 'x' );
    }

    /**
     * Tries every pair made of a first byte in C0-FF and a second byte in
     * 80-FF, wrapped in $head and $tail.
     *
     * @param string $head Text placed before the pair under test
     * @param string $tail Text placed after the pair under test
     */
    function doTestDoubleBytes( $head, $tail ) {
        for( $first = 0xc0; $first < 0x100; $first++ ) {
            for( $second = 0x80; $second < 0x100; $second++ ) {
                $char = $head . chr( $first ) . chr( $second ) . $tail;
                $clean = UtfNormal::cleanUp( $char );
                $x = sprintf( "%02X,%02X", $first, $second );
                if( $first > 0xc1 &&
                    $first < 0xe0 &&
                    $second < 0xc0 ) {
                    # Lead byte C2-DF with a continuation byte: a well-formed
                    # two-byte sequence, expected to match the NFC output.
                    $norm = UtfNormal::NFC( $char );
                    $this->assertEquals(
                        bin2hex( $norm ),
                        bin2hex( $clean ),
                        "Pair $x should be intact" );
                    if( $norm !== $clean ) return;
                } elseif( $first > 0xfd || $second > 0xbf ) {
                    # fe and ff are not legal head bytes -- expect two replacement chars
                    $norm = $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail;
                    $this->assertEquals(
                        bin2hex( $norm ),
                        bin2hex( $clean ),
                        "Forbidden pair $x should be rejected" );
                    if( $norm !== $clean ) return;
                } else {
                    # Remaining pairs are expected to collapse into a single
                    # replacement character.
                    $norm = $head . UTF8_REPLACEMENT . $tail;
                    $this->assertEquals(
                        bin2hex( $norm ),
                        bin2hex( $clean ),
                        "Forbidden pair $x should be rejected" );
                    if( $norm !== $clean ) return;
                }
            }
        }
    }

    /** Runs the three-byte check bare and with an "x" before and/or after. */
    function testTripleBytes() {
        $this->doTestTripleBytes( '', '' );
        $this->doTestTripleBytes( 'x', '' );
        $this->doTestTripleBytes( '', 'x' );
        $this->doTestTripleBytes( 'x', 'x' );
    }

    /**
     * Tries three-byte combinations: first byte C0-FF, second byte 80-FF and a
     * third byte that is currently pinned to 0x80 (the full 80-FF sweep is
     * commented out below).
     *
     * @param string $head Text placed before the triplet under test
     * @param string $tail Text placed after the triplet under test
     */
    function doTestTripleBytes( $head, $tail ) {
        for( $first = 0xc0; $first < 0x100; $first++ ) {
            for( $second = 0x80; $second < 0x100; $second++ ) {
                #for( $third = 0x80; $third < 0x100; $third++ ) {
                for( $third = 0x80; $third < 0x81; $third++ ) {
                    $char = $head . chr( $first ) . chr( $second ) . chr( $third ) . $tail;
                    $clean = UtfNormal::cleanUp( $char );
                    $x = sprintf( "%02X,%02X,%02X", $first, $second, $third );
                    if( $first >= 0xe0 &&
                        $first < 0xf0 &&
                        $second < 0xc0 &&
                        $third < 0xc0 ) {
                        # Structurally a three-byte sequence: lead byte E0-EF
                        # followed by two continuation bytes.
                        if( $first == 0xe0 && $second < 0xa0 ) {
                            $this->assertEquals(
                                bin2hex( $head . UTF8_REPLACEMENT . $tail ),
                                bin2hex( $clean ),
                                "Overlong triplet $x should be rejected" );
                        } elseif( $first == 0xed &&
                            ( chr( $first ) . chr( $second ) . chr( $third )) >= UTF8_SURROGATE_FIRST ) {
                            $this->assertEquals(
                                bin2hex( $head . UTF8_REPLACEMENT . $tail ),
                                bin2hex( $clean ),
                                "Surrogate triplet $x should be rejected" );
                        } else {
                            $this->assertEquals(
                                bin2hex( UtfNormal::NFC( $char ) ),
                                bin2hex( $clean ),
                                "Triplet $x should be intact" );
                        }
                    } elseif( $first > 0xc1 && $first < 0xe0 && $second < 0xc0 ) {
                        # First two bytes form a valid pair; the third byte is
                        # a stray and becomes one replacement character.
                        $this->assertEquals(
                            bin2hex( UtfNormal::NFC( $head . chr( $first ) . chr( $second ) ) . UTF8_REPLACEMENT . $tail ),
                            bin2hex( $clean ),
                            "Valid 2-byte $x + broken tail" );
                    } elseif( $second > 0xc1 && $second < 0xe0 && $third < 0xc0 ) {
                        # First byte is a stray; the last two bytes form a
                        # valid pair.
                        $this->assertEquals(
                            bin2hex( $head . UTF8_REPLACEMENT . UtfNormal::NFC( chr( $second ) . chr( $third ) . $tail ) ),
                            bin2hex( $clean ),
                            "Broken head + valid 2-byte $x" );
                    } elseif( ( $first > 0xfd || $second > 0xfd ) &&
                        ( ( $second > 0xbf && $third > 0xbf ) ||
                          ( $second < 0xc0 && $third < 0xc0 ) ||
                          ( $second > 0xfd ) ||
                          ( $third > 0xfd ) ) ) {
                        # fe and ff are not legal head bytes -- expect three replacement chars
                        $this->assertEquals(
                            bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
                            bin2hex( $clean ),
                            "Forbidden triplet $x should be rejected" );
                    } elseif( $first > 0xc2 && $second < 0xc0 && $third < 0xc0 ) {
                        $this->assertEquals(
                            bin2hex( $head . UTF8_REPLACEMENT . $tail ),
                            bin2hex( $clean ),
                            "Forbidden triplet $x should be rejected" );
                    } else {
                        $this->assertEquals(
                            bin2hex( $head . UTF8_REPLACEMENT . UTF8_REPLACEMENT . $tail ),
                            bin2hex( $clean ),
                            "Forbidden triplet $x should be rejected" );
                    }
                }
            }
        }
    }

    /** Fixed byte string guarding against a past chunking bug. */
    function testChunkRegression() {
        # Check for regression against a chunking bug
        $text   = "\x46\x55\xb8" .
                  "\xdc\x96" .
                  "\xee" .
                  "\xe7" .
                  "\x44" .
                  "\xaa" .
                  "\x2f\x25";
        $expect = "\x46\x55\xef\xbf\xbd" .
                  "\xdc\x96" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\x44" .
                  "\xef\xbf\xbd" .
                  "\x2f\x25";

        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /** Stray head and tail bytes interleaved with ASCII: each one is replaced individually. */
    function testInterposeRegression() {
        $text   = "\x4e\x30" .
                  "\xb1" .      # bad tail
                  "\x3a" .
                  "\x92" .      # bad tail
                  "\x62\x3a" .
                  "\x84" .      # bad tail
                  "\x43" .
                  "\xc6" .      # bad head
                  "\x3f" .
                  "\x92" .      # bad tail
                  "\xad" .      # bad tail
                  "\x7d" .
                  "\xd9\x95";

        $expect = "\x4e\x30" .
                  "\xef\xbf\xbd" .
                  "\x3a" .
                  "\xef\xbf\xbd" .
                  "\x62\x3a" .
                  "\xef\xbf\xbd" .
                  "\x43" .
                  "\xef\xbf\xbd" .
                  "\x3f" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\x7d" .
                  "\xd9\x95";

        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /** An overlong two-byte sequence surrounded by other invalid bytes. */
    function testOverlongRegression() {
        $text   = "\x67" .
                  "\x1a" .      # forbidden ascii
                  "\xea" .      # bad head
                  "\xc1\xa6" .  # overlong sequence
                  "\xad" .      # bad tail
                  "\x1c" .      # forbidden ascii
                  "\xb0" .      # bad tail
                  "\x3c" .
                  "\x9e";       # bad tail
        $expect = "\x67" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\x3c" .
                  "\xef\xbf\xbd";
        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /** An encoded surrogate followed by stray continuation bytes: four replacements. */
    function testSurrogateRegression() {
        $text   = "\xed\xb4\x96" . # surrogate 0xDD16
                  "\x83" .         # bad tail
                  "\xb4" .         # bad tail
                  "\xac";          # bad tail (0xAC is a continuation byte, not a head byte)
        $expect = "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd";
        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /** U+FFFE followed by a stray tail and a truncated head byte. */
    function testBomRegression() {
        $text   = "\xef\xbf\xbe" . # U+FFFE, illegal char
                  "\xb2" .         # bad tail
                  "\xef" .         # bad head
                  "\x59";
        $expect = "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\xef\xbf\xbd" .
                  "\x59";
        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /** U+FFFF on its own must be replaced. */
    function testForbiddenRegression() {
        $text   = "\xef\xbf\xbf"; # U+FFFF, illegal char
        $expect = "\xef\xbf\xbd";
        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }

    /** A Hangul character followed by a further final jamo must be left as is. */
    function testHangulRegression() {
        $text   = "\xed\x9c\xaf" . # Hangul char
                  "\xe1\x87\x81";  # followed by another final jamo
        $expect = $text;           # Should *not* change.
        $this->assertEquals(
            bin2hex( $expect ),
            bin2hex( UtfNormal::cleanUp( $text ) ) );
    }
}


# Run the suite right away and report the outcome through the exit status.
$suite = new PHPUnit_Framework_TestSuite( 'CleanUpTest' );
$result = PHPUnit_TextUI_TestRunner::run( $suite );

if( !$result->wasSuccessful() ) {
    exit( -1 );
}
exit( 0 );