00001 <?php
# Regular expression matching the character references the Sanitizer handles.
# Capture groups:
#   1: named entity          &name;
#   2: decimal reference     &#123;
#   3: hexadecimal reference &#x7b;  (lower-case "x")
#   4: hexadecimal reference &#X7B;  (upper-case "X")
#   5: a bare ampersand that is not part of any of the forms above
# The hexadecimal groups accept any ASCII alphanumerics; callers pass the
# capture through hexdec(). The /x flag allows the multi-line layout, which
# is why "#" is written as "\#" inside the pattern.
00031 define( 'MW_CHAR_REFS_REGEX',
00032 '/&([A-Za-z0-9\x80-\xff]+);
00033 |&\#([0-9]+);
00034 |&\#x([0-9A-Za-z]+);
00035 |&\#X([0-9A-Za-z]+);
00036 |(&)/x' );
00037
# Character class for one character of an attribute name.
00043 $attrib = '[A-Za-z0-9]';
# Character class for the whitespace that separates attributes
# (tab, line feed, carriage return, space).
00044 $space = '[\x09\x0a\x0d\x20]';
# Regular expression matching a single attribute in a tag's parameter text.
# Capture groups:
#   1: attribute name
#   2: the whole "= value" part (not set for a bare attribute)
#   3: double-quoted value
#   4: single-quoted value
#   5: unquoted value
#   6: unquoted "#hex" colour value
# Sanitizer::decodeTagAttributes() applies it with preg_match_all() and
# Sanitizer::getTagAttributeCallback() picks the value out of groups 3-6.
00045 define( 'MW_ATTRIBS_REGEX',
00046 "/(?:^|$space)($attrib+)
00047 ($space*=$space*
00048 (?:
00049 # The attribute value: quoted or alone
00050 \"([^<\"]*)\"
00051 | '([^<']*)'
00052 | ([a-zA-Z0-9!#$%&()*,\\-.\\/:;<>?@[\\]^_`{|}~]+)
00053 | (\#[0-9a-fA-F]+) # Technically wrong, but lots of
00054 # colors are specified like this.
00055 # We'll be normalizing it.
00056 )
00057 )?(?=$space|\$)/sx" );
00058
# Map of named character entities (without "&" and ";") to their Unicode
# code points. Keys are case-sensitive. Read by Sanitizer::normalizeEntity(),
# Sanitizer::decodeEntity() and Sanitizer::hackDocType().
00064 global $wgHtmlEntities;
00065 $wgHtmlEntities = array(
00066 'Aacute' => 193,
00067 'aacute' => 225,
00068 'Acirc' => 194,
00069 'acirc' => 226,
00070 'acute' => 180,
00071 'AElig' => 198,
00072 'aelig' => 230,
00073 'Agrave' => 192,
00074 'agrave' => 224,
00075 'alefsym' => 8501,
00076 'Alpha' => 913,
00077 'alpha' => 945,
00078 'amp' => 38,
00079 'and' => 8743,
00080 'ang' => 8736,
00081 'Aring' => 197,
00082 'aring' => 229,
00083 'asymp' => 8776,
00084 'Atilde' => 195,
00085 'atilde' => 227,
00086 'Auml' => 196,
00087 'auml' => 228,
00088 'bdquo' => 8222,
00089 'Beta' => 914,
00090 'beta' => 946,
00091 'brvbar' => 166,
00092 'bull' => 8226,
00093 'cap' => 8745,
00094 'Ccedil' => 199,
00095 'ccedil' => 231,
00096 'cedil' => 184,
00097 'cent' => 162,
00098 'Chi' => 935,
00099 'chi' => 967,
00100 'circ' => 710,
00101 'clubs' => 9827,
00102 'cong' => 8773,
00103 'copy' => 169,
00104 'crarr' => 8629,
00105 'cup' => 8746,
00106 'curren' => 164,
00107 'dagger' => 8224,
00108 'Dagger' => 8225,
00109 'darr' => 8595,
00110 'dArr' => 8659,
00111 'deg' => 176,
00112 'Delta' => 916,
00113 'delta' => 948,
00114 'diams' => 9830,
00115 'divide' => 247,
00116 'Eacute' => 201,
00117 'eacute' => 233,
00118 'Ecirc' => 202,
00119 'ecirc' => 234,
00120 'Egrave' => 200,
00121 'egrave' => 232,
00122 'empty' => 8709,
00123 'emsp' => 8195,
00124 'ensp' => 8194,
00125 'Epsilon' => 917,
00126 'epsilon' => 949,
00127 'equiv' => 8801,
00128 'Eta' => 919,
00129 'eta' => 951,
00130 'ETH' => 208,
00131 'eth' => 240,
00132 'Euml' => 203,
00133 'euml' => 235,
00134 'euro' => 8364,
00135 'exist' => 8707,
00136 'fnof' => 402,
00137 'forall' => 8704,
00138 'frac12' => 189,
00139 'frac14' => 188,
00140 'frac34' => 190,
00141 'frasl' => 8260,
00142 'Gamma' => 915,
00143 'gamma' => 947,
00144 'ge' => 8805,
00145 'gt' => 62,
00146 'harr' => 8596,
00147 'hArr' => 8660,
00148 'hearts' => 9829,
00149 'hellip' => 8230,
00150 'Iacute' => 205,
00151 'iacute' => 237,
00152 'Icirc' => 206,
00153 'icirc' => 238,
00154 'iexcl' => 161,
00155 'Igrave' => 204,
00156 'igrave' => 236,
00157 'image' => 8465,
00158 'infin' => 8734,
00159 'int' => 8747,
00160 'Iota' => 921,
00161 'iota' => 953,
00162 'iquest' => 191,
00163 'isin' => 8712,
00164 'Iuml' => 207,
00165 'iuml' => 239,
00166 'Kappa' => 922,
00167 'kappa' => 954,
00168 'Lambda' => 923,
00169 'lambda' => 955,
00170 'lang' => 9001,
00171 'laquo' => 171,
00172 'larr' => 8592,
00173 'lArr' => 8656,
00174 'lceil' => 8968,
00175 'ldquo' => 8220,
00176 'le' => 8804,
00177 'lfloor' => 8970,
00178 'lowast' => 8727,
00179 'loz' => 9674,
00180 'lrm' => 8206,
00181 'lsaquo' => 8249,
00182 'lsquo' => 8216,
00183 'lt' => 60,
00184 'macr' => 175,
00185 'mdash' => 8212,
00186 'micro' => 181,
00187 'middot' => 183,
00188 'minus' => 8722,
00189 'Mu' => 924,
00190 'mu' => 956,
00191 'nabla' => 8711,
00192 'nbsp' => 160,
00193 'ndash' => 8211,
00194 'ne' => 8800,
00195 'ni' => 8715,
00196 'not' => 172,
00197 'notin' => 8713,
00198 'nsub' => 8836,
00199 'Ntilde' => 209,
00200 'ntilde' => 241,
00201 'Nu' => 925,
00202 'nu' => 957,
00203 'Oacute' => 211,
00204 'oacute' => 243,
00205 'Ocirc' => 212,
00206 'ocirc' => 244,
00207 'OElig' => 338,
00208 'oelig' => 339,
00209 'Ograve' => 210,
00210 'ograve' => 242,
00211 'oline' => 8254,
00212 'Omega' => 937,
00213 'omega' => 969,
00214 'Omicron' => 927,
00215 'omicron' => 959,
00216 'oplus' => 8853,
00217 'or' => 8744,
00218 'ordf' => 170,
00219 'ordm' => 186,
00220 'Oslash' => 216,
00221 'oslash' => 248,
00222 'Otilde' => 213,
00223 'otilde' => 245,
00224 'otimes' => 8855,
00225 'Ouml' => 214,
00226 'ouml' => 246,
00227 'para' => 182,
00228 'part' => 8706,
00229 'permil' => 8240,
00230 'perp' => 8869,
00231 'Phi' => 934,
00232 'phi' => 966,
00233 'Pi' => 928,
00234 'pi' => 960,
00235 'piv' => 982,
00236 'plusmn' => 177,
00237 'pound' => 163,
00238 'prime' => 8242,
00239 'Prime' => 8243,
00240 'prod' => 8719,
00241 'prop' => 8733,
00242 'Psi' => 936,
00243 'psi' => 968,
00244 'quot' => 34,
00245 'radic' => 8730,
00246 'rang' => 9002,
00247 'raquo' => 187,
00248 'rarr' => 8594,
00249 'rArr' => 8658,
00250 'rceil' => 8969,
00251 'rdquo' => 8221,
00252 'real' => 8476,
00253 'reg' => 174,
00254 'rfloor' => 8971,
00255 'Rho' => 929,
00256 'rho' => 961,
00257 'rlm' => 8207,
00258 'rsaquo' => 8250,
00259 'rsquo' => 8217,
00260 'sbquo' => 8218,
00261 'Scaron' => 352,
00262 'scaron' => 353,
00263 'sdot' => 8901,
00264 'sect' => 167,
00265 'shy' => 173,
00266 'Sigma' => 931,
00267 'sigma' => 963,
00268 'sigmaf' => 962,
00269 'sim' => 8764,
00270 'spades' => 9824,
00271 'sub' => 8834,
00272 'sube' => 8838,
00273 'sum' => 8721,
00274 'sup' => 8835,
00275 'sup1' => 185,
00276 'sup2' => 178,
00277 'sup3' => 179,
00278 'supe' => 8839,
00279 'szlig' => 223,
00280 'Tau' => 932,
00281 'tau' => 964,
00282 'there4' => 8756,
00283 'Theta' => 920,
00284 'theta' => 952,
00285 'thetasym' => 977,
00286 'thinsp' => 8201,
00287 'THORN' => 222,
00288 'thorn' => 254,
00289 'tilde' => 732,
00290 'times' => 215,
00291 'trade' => 8482,
00292 'Uacute' => 218,
00293 'uacute' => 250,
00294 'uarr' => 8593,
00295 'uArr' => 8657,
00296 'Ucirc' => 219,
00297 'ucirc' => 251,
00298 'Ugrave' => 217,
00299 'ugrave' => 249,
00300 'uml' => 168,
00301 'upsih' => 978,
00302 'Upsilon' => 933,
00303 'upsilon' => 965,
00304 'Uuml' => 220,
00305 'uuml' => 252,
00306 'weierp' => 8472,
00307 'Xi' => 926,
00308 'xi' => 958,
00309 'Yacute' => 221,
00310 'yacute' => 253,
00311 'yen' => 165,
00312 'Yuml' => 376,
00313 'yuml' => 255,
00314 'Zeta' => 918,
00315 'zeta' => 950,
00316 'zwj' => 8205,
00317 'zwnj' => 8204 );
00318
# Alternative spellings of entity names, mapped to the canonical key used in
# $wgHtmlEntities. Sanitizer::normalizeEntity() and Sanitizer::decodeEntity()
# consult this table before looking the name up in $wgHtmlEntities.
00322 global $wgHtmlEntityAliases;
00323 $wgHtmlEntityAliases = array(
00324 'רלמ' => 'rlm',
00325 'رلم' => 'rlm',
00326 );
00327
00328
00333 class Sanitizer {
/**
 * Clean up HTML-ish wikitext: strip HTML comments, keep only whitelisted
 * elements with whitelisted attributes, and escape everything else.
 *
 * When $wgUseTidy is off the method also balances tags itself using a tag
 * stack (with a separate stack per nested table); when it is on, tags are
 * only filtered.
 *
 * @param string   $text            Input text
 * @param callable $processCallback Optional callback invoked as
 *                                  callback( &$params, $args ) to rewrite the raw
 *                                  attribute text of each permitted opening tag
 * @param array    $args            Extra argument handed to $processCallback
 * @param array    $extratags       Additional element names to permit (treated
 *                                  as paired tags)
 * @return string Sanitized text
 */
static function removeHTMLtags( $text, $processCallback = null, $args = array(), $extratags = array() ) {
	global $wgUseTidy;

	static $htmlpairs, $htmlsingle, $htmlsingleonly, $htmlnest, $tabletags,
		$htmllist, $listtags, $htmlsingleallowed, $htmlelements, $staticInitialised;

	wfProfileIn( __METHOD__ );

	# The lookup tables include $extratags, so they are cached per distinct
	# set of extra tags rather than frozen by whichever call came first.
	$extraKey = implode( '|', $extratags );
	if ( $staticInitialised !== $extraKey ) {

		$htmlpairs = array_merge( $extratags, array( # Tags that must be closed
			'b', 'del', 'i', 'ins', 'u', 'font', 'big', 'small', 'sub', 'sup', 'h1',
			'h2', 'h3', 'h4', 'h5', 'h6', 'cite', 'code', 'em', 's',
			'strike', 'strong', 'tt', 'var', 'div', 'center',
			'blockquote', 'ol', 'ul', 'dl', 'table', 'caption', 'pre',
			'ruby', 'rt' , 'rb' , 'rp', 'p', 'span', 'u'
		) );
		$htmlsingle = array(
			'br', 'hr', 'li', 'dt', 'dd'
		);
		$htmlsingleonly = array( # Elements that cannot have close tags
			'br', 'hr'
		);
		$htmlnest = array( # Tags that can be nested--??
			'table', 'tr', 'td', 'th', 'div', 'blockquote', 'ol', 'ul',
			'dl', 'font', 'big', 'small', 'sub', 'sup', 'span'
		);
		$tabletags = array( # Can only appear inside table, we will close them
			'td', 'th', 'tr',
		);
		$htmllist = array( # Tags used by list
			'ul','ol',
		);
		$listtags = array( # Tags that can appear in a list
			'li',
		);

		$htmlsingleallowed = array_merge( $htmlsingle, $tabletags );
		$htmlelements = array_merge( $htmlsingle, $htmlpairs, $htmlnest );

		# Convert them all to hashtables for faster lookup
		$vars = array( 'htmlpairs', 'htmlsingle', 'htmlsingleonly', 'htmlnest', 'tabletags',
			'htmllist', 'listtags', 'htmlsingleallowed', 'htmlelements' );
		foreach ( $vars as $var ) {
			$$var = array_flip( $$var );
		}
		$staticInitialised = $extraKey;
	}

	# Remove HTML comments
	$text = Sanitizer::removeHTMLcomments( $text );
	$bits = explode( '<', $text );
	# Text before the first "<" cannot contain a tag; escape stray ">"
	$text = str_replace( '>', '&gt;', array_shift( $bits ) );
	if ( !$wgUseTidy ) {
		$tagstack = $tablestack = array();
		foreach ( $bits as $x ) {
			$regs = array();
			if ( preg_match( '!^(/?)(\\w+)([^>]*?)(/{0,1}>)([^<]*)$!', $x, $regs ) ) {
				list( , $slash, $t, $params, $brace, $rest ) = $regs;
			} else {
				$slash = $t = $params = $brace = $rest = null;
			}

			$badtag = 0;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				# Check our stack
				if ( $slash ) {
					# Closing a tag...
					if ( isset( $htmlsingleonly[$t] ) ) {
						$badtag = 1;
					} elseif ( ( $ot = @array_pop( $tagstack ) ) != $t ) {
						if ( isset( $htmlsingleallowed[$ot] ) ) {
							# Pop all elements with an optional close tag
							# and see if we find a match below them
							$optstack = array();
							array_push( $optstack, $ot );
							while ( ( ( $ot = @array_pop( $tagstack ) ) != $t ) &&
								isset( $htmlsingleallowed[$ot] ) )
							{
								array_push( $optstack, $ot );
							}
							if ( $t != $ot ) {
								# No match. Push the optional elements back again
								$badtag = 1;
								while ( $ot = @array_pop( $optstack ) ) {
									array_push( $tagstack, $ot );
								}
							}
						} else {
							@array_push( $tagstack, $ot );
							# <li> can be nested in <ul> or <ol>, skip those cases:
							if ( !( isset( $htmllist[$ot] ) && isset( $listtags[$t] ) ) ) {
								$badtag = 1;
							}
						}
					} else {
						if ( $t == 'table' ) {
							# Leaving a table: restore the stack saved when it was opened
							$tagstack = array_pop( $tablestack );
						}
					}
					$newparams = '';
				} else {
					# Keep track for later
					if ( isset( $tabletags[$t] ) &&
						!in_array( 'table', $tagstack ) ) {
						$badtag = 1;
					} else if ( in_array( $t, $tagstack ) &&
						!isset( $htmlnest[$t] ) ) {
						$badtag = 1;
					# Is it a self closed htmlpair ? (bug 5487)
					} else if ( $brace == '/>' &&
						isset( $htmlpairs[$t] ) ) {
						$badtag = 1;
					} elseif ( isset( $htmlsingleonly[$t] ) ) {
						# Hack to force empty tag for uncloseable elements
						$brace = '/>';
					} else if ( isset( $htmlsingle[$t] ) ) {
						# Hack to not close $htmlsingle tags
						$brace = null;
					} else if ( isset( $tabletags[$t] )
						&& in_array( $t, $tagstack ) ) {
						$text .= "</$t>";
					} else {
						if ( $t == 'table' ) {
							# Entering a table: start a fresh stack for its contents
							array_push( $tablestack, $tagstack );
							$tagstack = array();
						}
						array_push( $tagstack, $t );
					}

					# Replace any variables or template parameters with
					# plaintext results.
					if ( is_callable( $processCallback ) ) {
						call_user_func_array( $processCallback, array( &$params, $args ) );
					}

					# Strip non-approved attributes from the tag
					$newparams = Sanitizer::fixTagAttributes( $params, $t );
				}
				if ( !$badtag ) {
					$rest = str_replace( '>', '&gt;', $rest );
					$close = ( $brace == '/>' && !$slash ) ? ' /' : '';
					$text .= "<$slash$t$newparams$close>$rest";
					continue;
				}
			}
			# Not an accepted tag: emit it as escaped text, never as markup
			$text .= '&lt;' . str_replace( '>', '&gt;', $x );
		}
		# Close off any remaining tags
		while ( is_array( $tagstack ) && ( $t = array_pop( $tagstack ) ) ) {
			$text .= "</$t>\n";
			if ( $t == 'table' ) { $tagstack = array_pop( $tablestack ); }
		}
	} else {
		# this might be possible using tidy itself
		foreach ( $bits as $x ) {
			preg_match( '/^(\\/?)(\\w+)([^>]*?)(\\/{0,1}>)([^<]*)$/',
				$x, $regs );
			@list( , $slash, $t, $params, $brace, $rest ) = $regs;
			if ( isset( $htmlelements[$t = strtolower( $t )] ) ) {
				if ( is_callable( $processCallback ) ) {
					call_user_func_array( $processCallback, array( &$params, $args ) );
				}
				$newparams = Sanitizer::fixTagAttributes( $params, $t );
				$rest = str_replace( '>', '&gt;', $rest );
				$text .= "<$slash$t$newparams$brace$rest";
			} else {
				# Not an accepted tag: emit it as escaped text
				$text .= '&lt;' . str_replace( '>', '&gt;', $x );
			}
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00518
/**
 * Strip every terminated HTML comment from the text.
 *
 * A comment that is alone on its line (only spaces between it and the
 * surrounding newlines) is removed together with those spaces and one of
 * the two newlines. Processing stops at the first unterminated comment,
 * which is left in place.
 *
 * @param string $text
 * @return string
 */
static function removeHTMLcomments( $text ) {
	wfProfileIn( __METHOD__ );
	for ( ;; ) {
		$open = strpos( $text, '<!--' );
		if ( $open === false ) {
			break;
		}
		$close = strpos( $text, '-->', $open + 4 );
		if ( $close === false ) {
			# Unterminated comment; leave the remainder untouched
			break;
		}
		# Point just past the closing "-->"
		$close += 3;

		# Grow a window [$from, $from + $len) over the spaces around the comment
		$from = max( $open - 1, 0 );
		$len = $close - $from;
		while ( substr( $text, $from, 1 ) === ' ' && $from > 0 ) {
			$from--;
			$len++;
		}
		while ( substr( $text, $from + $len, 1 ) === ' ' ) {
			$len++;
		}

		$newlineBefore = substr( $text, $from, 1 ) === "\n";
		$newlineAfter = $newlineBefore && substr( $text, $from + $len, 1 ) === "\n";
		if ( $newlineAfter ) {
			# Comment sits on a line of its own: drop it with its padding
			# and collapse the two newlines into one.
			$text = substr_replace( $text, "\n", $from, $len + 1 );
		} else {
			# Inline comment: remove only the comment itself.
			$text = substr_replace( $text, '', $open, $close - $open );
		}
	}
	wfProfileOut( __METHOD__ );
	return $text;
}
00563
/**
 * Filter a decoded attribute array against the whitelist of one element.
 *
 * @param array  $attribs Map of attribute name => value
 * @param string $element Element name used to look up the whitelist
 * @return array Filtered attribute map, as returned by validateAttributes()
 */
static function validateTagAttributes( $attribs, $element ) {
	$allowed = self::attributeWhitelist( $element );
	return self::validateAttributes( $attribs, $allowed );
}
00583
/**
 * Keep only whitelisted attributes, with extra checks for "style" and "id".
 *
 * - Attributes whose name is not in $whitelist are dropped.
 * - "style" values go through checkCss(); the attribute is dropped when
 *   that returns false.
 * - "id" values are rewritten by escapeId(), using mode 'noninitial' when
 *   $wgEnforceHtmlIds is set and 'xml' otherwise.
 *
 * @param array $attribs   Map of attribute name => value
 * @param array $whitelist List of permitted attribute names
 * @return array Filtered map, in the order of $attribs
 */
static function validateAttributes( $attribs, $whitelist ) {
	$allowed = array_flip( $whitelist );
	$clean = array();
	foreach ( $attribs as $name => $val ) {
		if ( isset( $allowed[$name] ) ) {
			if ( $name == 'style' ) {
				$val = self::checkCss( $val );
				if ( $val === false ) {
					# Rejected stylesheet: drop the attribute entirely
					continue;
				}
			}

			if ( $name === 'id' ) {
				global $wgEnforceHtmlIds;
				$mode = $wgEnforceHtmlIds ? 'noninitial' : 'xml';
				$val = self::escapeId( $val, $mode );
			}

			$clean[$name] = $val;
		}
	}
	return $clean;
}
00628
/**
 * Merge two attribute maps; $b wins on conflicts, except that differing
 * string "class" values are combined into one de-duplicated,
 * space-separated list.
 *
 * @param array $a
 * @param array $b
 * @return array
 */
static function mergeAttributes( $a, $b ) {
	$merged = array_merge( $a, $b );

	if ( !isset( $a['class'] ) || !isset( $b['class'] ) ) {
		return $merged;
	}
	if ( !is_string( $a['class'] ) || !is_string( $b['class'] ) ) {
		return $merged;
	}
	if ( $a['class'] === $b['class'] ) {
		return $merged;
	}

	$tokens = preg_split( '/\s+/', $a['class'] . ' ' . $b['class'],
		-1, PREG_SPLIT_NO_EMPTY );
	$merged['class'] = implode( ' ', array_unique( $tokens ) );
	return $merged;
}
00650
/**
 * Check an inline stylesheet for constructs that must not be allowed.
 *
 * The value has character references decoded and CSS comments replaced by
 * a space. A further-normalized copy (CSS hex escapes decoded, backslashes
 * removed) is then searched for "expression", "tp://" / "tps://" style URL
 * schemes and "url(".
 *
 * @param string $value Raw style attribute value
 * @return string|false The decoded, comment-stripped value, or false when a
 *                      forbidden construct is found
 */
static function checkCss( $value ) {
	$stripped = Sanitizer::decodeCharReferences( $value );

	# Replace CSS comments with a space so they cannot hide or split tokens
	$stripped = StringUtils::delimiterReplace( '/*', '*/', ' ', $stripped );

	# This is what gets returned; the steps below only feed the check
	$value = $stripped;

	# Decode CSS hex escapes (backslash, 1-6 hex digits, optional single
	# whitespace). A callback is used because the /e modifier no longer
	# exists in PHP 7 and later.
	$stripped = preg_replace_callback(
		'!\\\\([0-9A-Fa-f]{1,6})[ \\n\\r\\t\\f]?!',
		array( 'Sanitizer', 'cssDecodeCallback' ),
		$stripped );
	$stripped = str_replace( '\\', '', $stripped );
	if ( preg_match( '/(?:expression|tps*:\/\/|url\\s*\().*/is',
			$stripped ) ) {
		# Forbidden construct found
		return false;
	}

	return $value;
}

/**
 * preg_replace_callback() helper for checkCss(): turn one CSS hex escape
 * into the UTF-8 encoding of its code point.
 *
 * @param array $matches Match array; index 1 holds the hex digits
 * @return string
 */
private static function cssDecodeCallback( $matches ) {
	return codepointToUtf8( hexdec( $matches[1] ) );
}
00680
/**
 * Turn the raw attribute text of a tag into a sanitized attribute string.
 *
 * The text is decoded into name/value pairs, filtered against the
 * element's whitelist and re-encoded as name="value" pairs.
 *
 * @param string $text    Raw attribute text from inside the tag
 * @param string $element Element name
 * @return string Empty string when nothing survives; otherwise the pairs,
 *                each preceded by a single space
 */
static function fixTagAttributes( $text, $element ) {
	if ( trim( $text ) == '' ) {
		return '';
	}

	$decoded = self::decodeTagAttributes( $text );
	$allowed = self::validateTagAttributes( $decoded, $element );

	$rendered = '';
	foreach ( $allowed as $name => $val ) {
		$encName = htmlspecialchars( $name );
		$encVal = self::safeEncodeAttribute( $val );
		$rendered .= " $encName=\"$encVal\"";
	}
	return $rendered;
}
00717
/**
 * Encode a string for use as an HTML attribute value.
 *
 * HTML special characters (including both quote styles) are escaped, and
 * newline, carriage return and tab are written as numeric character
 * references so they are carried literally rather than as raw whitespace.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
static function encodeAttribute( $text ) {
	$encValue = htmlspecialchars( $text, ENT_QUOTES );

	# Raw whitespace control characters become explicit references
	$encValue = strtr( $encValue, array(
		"\n" => '&#10;',
		"\r" => '&#13;',
		"\t" => '&#9;',
	) );

	return $encValue;
}
00737
/**
 * Encode an attribute value and additionally armor it against later
 * wikitext parsing.
 *
 * After encodeAttribute(), characters and keywords that later parser
 * passes could act on (templates, links, quote markup, magic links,
 * table pipes, double underscores, URL protocols) are rewritten with
 * character references.
 *
 * @param string $text
 * @return string HTML-encoded text fragment
 */
static function safeEncodeAttribute( $text ) {
	$encValue = Sanitizer::encodeAttribute( $text );

	# Templates and links may be expanded in later parsing,
	# creating invalid or dangerous output. Suppress this.
	$encValue = strtr( $encValue, array(
		# The first three should already have been escaped by
		# encodeAttribute(); they are listed as a safety net.
		'<'    => '&lt;',
		'>'    => '&gt;',
		'"'    => '&quot;',
		'{'    => '&#123;',
		'['    => '&#91;',
		"''"   => '&#39;&#39;',
		'ISBN' => '&#73;SBN',
		'RFC'  => '&#82;FC',
		'PMID' => '&#80;MID',
		'|'    => '&#124;',
		'__'   => '&#95;_',
	) );

	# Break up URL protocols by encoding their colon
	$encValue = preg_replace_callback(
		'/(' . wfUrlProtocols() . ')/',
		array( 'Sanitizer', 'armorLinksCallback' ),
		$encValue );
	return $encValue;
}
00770
/**
 * Turn an arbitrary string into something usable as an id attribute.
 *
 * Two modes exist, selected through $options:
 * - default (no 'xml'): spaces become underscores, character references
 *   are decoded, the result is URL-encoded, then "%3A" is turned back into
 *   ":" and every remaining "%" into ".". An "x" is prepended when the
 *   result does not start with an ASCII letter.
 * - 'xml': every run of characters outside the XML NameChar set (and every
 *   run of underscores) becomes a single "_", leading/trailing underscores
 *   are trimmed, and "_" is prepended when the first character is not a
 *   NameStartChar.
 * The 'noninitial' option suppresses the prefixing step in both modes.
 *
 * @param string       $id      Id to escape
 * @param string|array $options One option name or a list of them
 * @return string
 */
static function escapeId( $id, $options = array() ) {
	$options = (array)$options;
	$mayStartAnyhow = in_array( 'noninitial', $options );

	if ( in_array( 'xml', $options ) ) {
		# XML-style escaping. For the patterns used, see the XML 1.0 standard,
		# 5th edition, NameStartChar and NameChar: <http://www.w3.org/TR/REC-xml/>
		$nameStartChar = ':a-zA-Z_\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}'
			. '\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}'
			. '\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}\x{10000}-\x{EFFFF}';
		$nameChar = $nameStartChar . '.\-0-9\xB7\x{0300}-\x{036F}'
			. '\x{203F}-\x{2040}';

		# Replace _ as well so we don't get multiple consecutive underscores
		$id = trim( preg_replace( "/([^$nameChar]|_)+/u", '_', $id ), '_' );

		$startsValid = preg_match( "/^[$nameStartChar]/u", $id );
		if ( !$startsValid && !$mayStartAnyhow ) {
			$id = "_$id";
		}
		return $id;
	}

	# HTML4-style escaping
	$underscored = strtr( $id, ' ', '_' );
	$id = urlencode( Sanitizer::decodeCharReferences( $underscored ) );
	$id = str_replace( array( '%3A', '%' ), array( ':', '.' ), $id );

	if ( !preg_match( '/^[a-zA-Z]/', $id ) && !$mayStartAnyhow ) {
		# The id must start with a letter; add a dummy one
		$id = "x$id";
	}
	return $id;
}
00831
/**
 * Turn an arbitrary string into something usable as a class name.
 *
 * A leading digit or hyphen, ASCII control characters and space, the
 * listed ASCII punctuation, and the byte pair C2 A0 are each replaced by
 * "_"; runs of underscores are then collapsed and trailing underscores
 * removed.
 *
 * @param string $class
 * @return string
 */
static function escapeClass( $class ) {
	$patterns = array(
		'/(^[0-9\\-])|[\\x00-\\x20!"#$%&\'()*+,.\\/:;<=>?@[\\]^`{|}~]|\\xC2\\xA0/',
		'/_+/',
	);
	$escaped = preg_replace( $patterns, '_', $class );
	return rtrim( $escaped, '_' );
}
00850
/**
 * Escape HTML special characters in a string while still letting valid
 * character references written by the author through.
 *
 * The text is fully escaped first; the escaping of "&" is then undone so
 * that normalizeCharReferences() can decide, reference by reference,
 * which ampersands start a valid reference and which must stay escaped.
 *
 * @param string $html
 * @return string Escaped text
 */
static function escapeHtmlAllowEntities( $html ) {
	# It seems wise to escape ' as well as ", as a matter of course. Can't
	# hurt.
	$html = htmlspecialchars( $html, ENT_QUOTES );
	$html = str_replace( '&amp;', '&', $html );
	$html = Sanitizer::normalizeCharReferences( $html );
	return $html;
}
00866
/**
 * preg_replace_callback() helper for safeEncodeAttribute(): encode the
 * colon of a matched URL protocol as a character reference so the text is
 * no longer recognisable as a URL.
 *
 * @param array $matches Match array; index 1 holds the protocol
 * @return string
 */
private static function armorLinksCallback( $matches ) {
	return str_replace( ':', '&#58;', $matches[1] );
}
00876
/**
 * Parse the raw attribute text of a tag into a name => value map.
 *
 * Names are lower-cased. In each value, runs of tab/CR/LF/space are
 * collapsed to a single space, the ends are trimmed, and character
 * references are decoded. A later duplicate of a name overwrites the
 * earlier one.
 *
 * @param string $text Raw attribute text
 * @return array Map of attribute name => decoded value (empty when the
 *               text is blank or nothing matches MW_ATTRIBS_REGEX)
 */
public static function decodeTagAttributes( $text ) {
	$result = array();

	if ( trim( $text ) == '' ) {
		return $result;
	}

	$matches = array();
	$found = preg_match_all( MW_ATTRIBS_REGEX, $text, $matches, PREG_SET_ORDER );
	if ( !$found ) {
		return $result;
	}

	foreach ( $matches as $match ) {
		$name = strtolower( $match[1] );
		$raw = self::getTagAttributeCallback( $match );

		# Collapse internal whitespace and drop it at the ends
		$collapsed = trim( preg_replace( '/[\t\r\n ]+/', ' ', $raw ) );

		$result[$name] = self::decodeCharReferences( $collapsed );
	}
	return $result;
}
00914
/**
 * Pick the attribute value out of one MW_ATTRIBS_REGEX match set.
 *
 * @param array $set One match set from preg_match_all( ..., PREG_SET_ORDER )
 * @return string The value; for an attribute written without "= value",
 *                the attribute name itself
 * @throws MWException When a "= value" part matched but no value group did
 */
private static function getTagAttributeCallback( $set ) {
	# Value groups, highest index first:
	# 6 = unquoted #XXXXXX colour, 5 = unquoted, 4 = single-quoted, 3 = double-quoted
	foreach ( array( 6, 5, 4, 3 ) as $group ) {
		if ( isset( $set[$group] ) ) {
			return $set[$group];
		}
	}

	if ( isset( $set[2] ) ) {
		throw new MWException( "Tag conditions not met. This should never happen and is a bug." );
	}

	# In XHTML, attributes must have a value.
	# For 'reduced' form, return explicitly the attribute name here.
	return $set[1];
}
00944
/**
 * Normalize an attribute value: normalize its character references, turn
 * each whitespace character into a space, and encode double quotes so the
 * result can sit inside a double-quoted attribute.
 *
 * @param string $text
 * @return string
 */
private static function normalizeAttributeValue( $text ) {
	return str_replace( '"', '&quot;',
		self::normalizeWhitespace(
			Sanitizer::normalizeCharReferences( $text ) ) );
}
00962
/**
 * Replace each whitespace character (space, CR, LF, tab) with a single
 * space; a CR LF pair counts as one character.
 *
 * @param string $text
 * @return string
 */
private static function normalizeWhitespace( $text ) {
	$anyWhitespace = '/\r\n|[\x20\x0d\x0a\x09]/';
	return preg_replace( $anyWhitespace, ' ', $text );
}
00969
/**
 * Run every character reference and bare ampersand in the text through
 * normalizeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
static function normalizeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'normalizeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
/**
 * preg_replace_callback() helper for normalizeCharReferences().
 *
 * Dispatches on which MW_CHAR_REFS_REGEX group matched (1 = named,
 * 2 = decimal, 3/4 = hexadecimal). When the chosen handler rejects the
 * reference, or only a bare ampersand matched, the matched text is
 * returned HTML-escaped.
 *
 * @param array $matches
 * @return string
 */
static function normalizeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		$normalized = self::normalizeEntity( $matches[1] );
	} elseif ( $matches[2] != '' ) {
		$normalized = self::decCharReference( $matches[2] );
	} elseif ( $matches[3] != '' ) {
		$normalized = self::hexCharReference( $matches[3] );
	} elseif ( $matches[4] != '' ) {
		$normalized = self::hexCharReference( $matches[4] );
	} else {
		$normalized = null;
	}

	return $normalized === null
		? htmlspecialchars( $matches[0] )
		: $normalized;
}
01011
/**
 * Normalize one named entity.
 *
 * - A name listed in $wgHtmlEntityAliases is rewritten to its canonical
 *   entity.
 * - A name listed in $wgHtmlEntities is returned unchanged.
 * - Any other name is not a valid entity, so its ampersand is escaped and
 *   the text is shown literally.
 *
 * @param string $name Entity name without "&" and ";"
 * @return string Normalized entity, or the escaped text for an unknown name
 */
static function normalizeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;
	if ( isset( $wgHtmlEntityAliases[$name] ) ) {
		return "&{$wgHtmlEntityAliases[$name]};";
	} elseif ( isset( $wgHtmlEntities[$name] ) ) {
		return "&$name;";
	} else {
		return "&amp;$name;";
	}
}
01032
/**
 * Normalize a decimal character reference.
 *
 * @param string $codepoint The digits of the reference
 * @return string|null "&#N;" for a code point accepted by
 *                     validateCodepoint(), otherwise null
 */
static function decCharReference( $codepoint ) {
	$value = intval( $codepoint );
	return self::validateCodepoint( $value )
		? sprintf( '&#%d;', $value )
		: null;
}
01041
/**
 * Normalize a hexadecimal character reference.
 *
 * @param string $codepoint The hex digits of the reference
 * @return string|null "&#xN;" (lower-case hex) for a code point accepted
 *                     by validateCodepoint(), otherwise null
 */
static function hexCharReference( $codepoint ) {
	$value = hexdec( $codepoint );
	return self::validateCodepoint( $value )
		? sprintf( '&#x%x;', $value )
		: null;
}
01050
/**
 * Tell whether a code point may be emitted as a character reference.
 *
 * Accepted: tab, line feed, carriage return, and the ranges
 * 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
 *
 * @param int $codepoint
 * @return bool
 */
private static function validateCodepoint( $codepoint ) {
	if ( $codepoint == 0x09 || $codepoint == 0x0a || $codepoint == 0x0d ) {
		return true;
	}

	$ranges = array(
		array( 0x20, 0xd7ff ),
		array( 0xe000, 0xfffd ),
		array( 0x10000, 0x10ffff ),
	);
	foreach ( $ranges as $range ) {
		if ( $codepoint >= $range[0] && $codepoint <= $range[1] ) {
			return true;
		}
	}
	return false;
}
01064
/**
 * Decode the character references in the text by running each match of
 * MW_CHAR_REFS_REGEX through decodeCharReferencesCallback().
 *
 * @param string $text
 * @return string
 */
public static function decodeCharReferences( $text ) {
	$callback = array( 'Sanitizer', 'decodeCharReferencesCallback' );
	return preg_replace_callback( MW_CHAR_REFS_REGEX, $callback, $text );
}
01080
/**
 * preg_replace_callback() helper for decodeCharReferences().
 *
 * Dispatches on which MW_CHAR_REFS_REGEX group matched: 1 = named entity,
 * 2 = decimal, 3/4 = hexadecimal. Anything else (a bare ampersand) is
 * returned unchanged.
 *
 * @param array $matches
 * @return string
 */
static function decodeCharReferencesCallback( $matches ) {
	if ( $matches[1] != '' ) {
		return self::decodeEntity( $matches[1] );
	}
	if ( $matches[2] != '' ) {
		return self::decodeChar( intval( $matches[2] ) );
	}
	# Groups 3 and 4 are the lower- and upper-case "x" hex forms
	foreach ( array( 3, 4 ) as $hexGroup ) {
		if ( $matches[$hexGroup] != '' ) {
			return self::decodeChar( hexdec( $matches[$hexGroup] ) );
		}
	}
	# Last case should be an ampersand by itself
	return $matches[0];
}
01098
/**
 * Convert a code point to its UTF-8 string.
 *
 * @param int $codepoint
 * @return string Result of codepointToUtf8() for a code point accepted by
 *                validateCodepoint(), otherwise UTF8_REPLACEMENT
 */
static function decodeChar( $codepoint ) {
	if ( !self::validateCodepoint( $codepoint ) ) {
		return UTF8_REPLACEMENT;
	}
	return codepointToUtf8( $codepoint );
}
01113
/**
 * Decode one named entity to its UTF-8 string.
 *
 * The name is first mapped through $wgHtmlEntityAliases. When the
 * resulting name is not in $wgHtmlEntities, the text "&name;" is returned
 * (with the alias already resolved).
 *
 * @param string $name Entity name without "&" and ";"
 * @return string
 */
static function decodeEntity( $name ) {
	global $wgHtmlEntities, $wgHtmlEntityAliases;

	$canonical = isset( $wgHtmlEntityAliases[$name] )
		? $wgHtmlEntityAliases[$name]
		: $name;

	if ( !isset( $wgHtmlEntities[$canonical] ) ) {
		return "&$canonical;";
	}
	return codepointToUtf8( $wgHtmlEntities[$canonical] );
}
01133
/**
 * Return the list of attribute names permitted on an element.
 *
 * The full table is built once by setupAttributeWhitelist() and kept in a
 * static variable.
 *
 * @param string $element
 * @return array List of attribute names; empty for an unknown element
 */
static function attributeWhitelist( $element ) {
	static $list;
	if ( !isset( $list ) ) {
		$list = self::setupAttributeWhitelist();
	}
	if ( isset( $list[$element] ) ) {
		return $list[$element];
	}
	return array();
}
01150
/**
 * Build the table of permitted attributes per element.
 *
 * @return array Map of element name => list of attribute names
 */
static function setupAttributeWhitelist() {
	# Building blocks shared by several elements
	$common = array( 'id', 'class', 'lang', 'dir', 'title', 'style' );
	$block = array_merge( $common, array( 'align' ) );
	$tablealign = array( 'align', 'char', 'charoff', 'valign' );
	$tablecell = array(
		'abbr', 'axis', 'headers', 'scope', 'rowspan', 'colspan',
		# The remaining four are deprecated
		'nowrap', 'width', 'height', 'bgcolor',
	);

	# Composite lists used by more than one element below
	$edit = array_merge( $common, array( 'cite', 'datetime' ) );
	$rowGroup = array_merge( $common, $tablealign );
	$column = array_merge( $common, array( 'span', 'width' ), $tablealign );
	$cell = array_merge( $common, $tablecell, $tablealign );

	# Numbers refer to sections in HTML 4.01 standard describing the element.
	# See: http://www.w3.org/TR/html4/
	# Elements named only in a comment are not whitelisted.
	return array(
		# 7.5.4
		'div'        => $block,
		'center'     => $common, # deprecated
		'span'       => $block, # ??

		# 7.5.5
		'h1'         => $block,
		'h2'         => $block,
		'h3'         => $block,
		'h4'         => $block,
		'h5'         => $block,
		'h6'         => $block,

		# 7.5.6: address
		# 8.2.4: bdo

		# 9.2.1 (dfn, samp, kbd, abbr, acronym are not listed)
		'em'         => $common,
		'strong'     => $common,
		'cite'       => $common,
		'code'       => $common,
		'var'        => $common,

		# 9.2.2 (q is not listed)
		'blockquote' => array_merge( $common, array( 'cite' ) ),

		# 9.2.3
		'sub'        => $common,
		'sup'        => $common,

		# 9.3.1
		'p'          => $block,

		# 9.3.2
		'br'         => array( 'id', 'class', 'title', 'style', 'clear' ),

		# 9.3.4
		'pre'        => array_merge( $common, array( 'width' ) ),

		# 9.4
		'ins'        => $edit,
		'del'        => $edit,

		# 10.2
		'ul'         => array_merge( $common, array( 'type' ) ),
		'ol'         => array_merge( $common, array( 'type', 'start' ) ),
		'li'         => array_merge( $common, array( 'type', 'value' ) ),

		# 10.3
		'dl'         => $common,
		'dd'         => $common,
		'dt'         => $common,

		# 11.2.1
		'table'      => array_merge( $common, array(
			'summary', 'width', 'border', 'frame',
			'rules', 'cellspacing', 'cellpadding',
			'align', 'bgcolor',
		) ),

		# 11.2.2
		'caption'    => array_merge( $common, array( 'align' ) ),

		# 11.2.3
		'thead'      => $rowGroup,
		'tfoot'      => $rowGroup,
		'tbody'      => $rowGroup,

		# 11.2.4
		'colgroup'   => $column,
		'col'        => $column,

		# 11.2.5
		'tr'         => array_merge( $common, array( 'bgcolor' ), $tablealign ),

		# 11.2.6
		'td'         => $cell,
		'th'         => $cell,

		# 13.2
		# Not usually allowed, but may be used for extension-style hooks
		# such as <math> when it is rasterized
		'img'        => array_merge( $common, array( 'alt' ) ),

		# 15.2.1
		'tt'         => $common,
		'b'          => $common,
		'i'          => $common,
		'big'        => $common,
		'small'      => $common,
		'strike'     => $common,
		's'          => $common,
		'u'          => $common,

		# 15.2.2 (basefont is not listed)
		'font'       => array_merge( $common, array( 'size', 'color', 'face' ) ),

		# 15.3
		'hr'         => array_merge( $common, array( 'noshade', 'size', 'width' ) ),

		# XHTML Ruby annotation text module, simple ruby only
		# (rbc and rtc are not listed; rt does not allow rbspan).
		'ruby'       => $common,
		'rb'         => $common,
		'rt'         => $common,
		'rp'         => $common,

		# MathML root element, where used for extensions
		# 'title' may not be 100% valid here; it's XHTML
		# http://www.w3.org/TR/REC-MathML/
		'math'       => array( 'class', 'style', 'id', 'title' ),
	);
}
01301
/**
 * Reduce text to plain text: remove everything between "<" and ">",
 * decode character references, and turn each whitespace character into a
 * space.
 *
 * @param string $text
 * @return string
 */
static function stripAllTags( $text ) {
	# Actual <tags>
	$withoutTags = StringUtils::delimiterReplace( '<', '>', '', $text );

	# Normalize &entities and whitespace
	$decoded = self::decodeCharReferences( $withoutTags );
	return self::normalizeWhitespace( $decoded );
}
01322
/**
 * Build a DOCTYPE declaration whose internal subset declares every entity
 * in $wgHtmlEntities as the matching numeric character reference.
 *
 * @return string
 */
static function hackDocType() {
	global $wgHtmlEntities;

	$declarations = array();
	foreach ( $wgHtmlEntities as $entity => $codepoint ) {
		$declarations[] = "<!ENTITY $entity \"&#$codepoint;\">";
	}
	return "<!DOCTYPE html [\n" . implode( '', $declarations ) . "]>\n";
}
01342
/**
 * Normalize a URL before it is turned into a link.
 *
 * Character references are decoded; brackets, angle brackets, double
 * quotes, ASCII control characters, space and DEL are percent-encoded;
 * and, when the URL has the form "scheme:[//host]rest", whitespace and the
 * listed invisible characters are removed from the host part.
 *
 * @param string $url
 * @return string
 */
static function cleanUrl( $url ) {
	# Normalize any HTML entities in input. They will be
	# re-escaped by makeExternalLink().
	$url = Sanitizer::decodeCharReferences( $url );

	# Escape any control characters introduced by the above step.
	# A callback is used because the /e modifier no longer exists in
	# PHP 7 and later.
	$url = preg_replace_callback( '/[\][<>"\\x00-\\x20\\x7F]/',
		array( 'Sanitizer', 'cleanUrlCallback' ), $url );

	# Validate hostname portion
	$matches = array();
	if ( preg_match( '!^([^:]+:)(//[^/]+)?(.*)$!iD', $url, $matches ) ) {
		list( /* $whole */, $protocol, $host, $rest ) = $matches;

		// Characters that will be ignored in IDNs.
		// http://tools.ietf.org/html/3454#section-3.1
		// Strip them before further processing so blacklists and such work.
		$strip = "/
			\\s|          # general whitespace
			\xc2\xad|     # 00ad SOFT HYPHEN
			\xe1\xa0\x86| # 1806 MONGOLIAN TODO SOFT HYPHEN
			\xe2\x80\x8b| # 200b ZERO WIDTH SPACE
			\xe2\x81\xa0| # 2060 WORD JOINER
			\xef\xbb\xbf| # feff ZERO WIDTH NO-BREAK SPACE
			\xcd\x8f|     # 034f COMBINING GRAPHEME JOINER
			\xe1\xa0\x8b| # 180b MONGOLIAN FREE VARIATION SELECTOR ONE
			\xe1\xa0\x8c| # 180c MONGOLIAN FREE VARIATION SELECTOR TWO
			\xe1\xa0\x8d| # 180d MONGOLIAN FREE VARIATION SELECTOR THREE
			\xe2\x80\x8c| # 200c ZERO WIDTH NON-JOINER
			\xe2\x80\x8d| # 200d ZERO WIDTH JOINER
			[\xef\xb8\x80-\xef\xb8\x8f] # fe00-fe00f VARIATION SELECTOR-1-16
			/xuD";

		$host = preg_replace( $strip, '', $host );

		// @fixme: validate hostnames here

		return $protocol . $host . $rest;
	} else {
		return $url;
	}
}

/**
 * preg_replace_callback() helper for cleanUrl(): percent-encode the
 * matched character.
 *
 * @param array $matches Match array; index 0 holds the character
 * @return string
 */
private static function cleanUrlCallback( $matches ) {
	return urlencode( $matches[0] );
}
01384
01385 }