00001 <?php
00002
# The converter prints its result to standard output, so it only makes
# sense when started from a shell; bail out under any other SAPI.
if( PHP_SAPI !== 'cli' ) {
	echo "Please customize the settings and run me from the command line.";
	exit( -1 );
}

# Source encoding handed to iconv() by recodeText().
$wgImportEncoding = "CP1252";
# Directory under which the "page" and "keep" trees are looked up.
$wgRootDirectory = "/kalman/Projects/wiki2002/wiki/lib-http/db/wiki";

# Best effort only: a failure to raise the limit is deliberately silenced.
@ini_set( 'memory_limit', '40M' );

$wgFieldSeparator = "\xb3"; # Some wikis may use different char
$FS = $wgFieldSeparator;
# Three nesting levels of record separators: the base byte followed by a digit.
$FS1 = "{$FS}1";
$FS2 = "{$FS}2";
$FS3 = "{$FS}3";

# Unicode sanitization tools
require_once( dirname( dirname( __FILE__ ) ) . '/includes/normal/UtfNormal.php' );

# Lookup table consulted by checkUserCache(); nothing in this script fills it.
$usercache = array();

# Run the conversion.
importPages();
00054
00055 # ------------------------------------------------------------------------------
00056
# Print the whole export document to standard output: the XML declaration
# and opening <mediawiki> element, the pages found in every existing
# "$wgRootDirectory/page/<bucket>" directory (A..Z and "other"), and the
# closing </mediawiki> tag.
function importPages()
{
	global $wgRootDirectory;

	# The final character of the XML declaration is interpolated rather
	# than written literally (presumably to keep a question-mark/greater-than
	# pair out of the PHP source -- confirm before simplifying).
	$gt = '>';
	echo <<<END
<?xml version="1.0" encoding="UTF-8" ?$gt
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
http://www.mediawiki.org/xml/export-0.1.xsd"
version="0.1"
xml:lang="en">
<!-- generated by importUseModWiki.php -->

END;

	# One bucket per capital letter plus a catch-all bucket.
	$buckets = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $buckets as $bucket ) {
		$path = $wgRootDirectory . '/page/' . $bucket;
		if( !is_dir( $path ) ) {
			continue;
		}
		importPageDirectory( $path );
	}

	echo "</mediawiki>\n";
}
00087
# Walk one page directory and print the XML for everything found in it.
#
# Every "<name>.db" entry is imported as the page "$prefix<name>"; every
# other sub-directory is walked recursively; anything else is reported in
# an XML comment and skipped.
#
# $dir    - directory to scan
# $prefix - title prefix accumulated from enclosing sub-directories,
#           e.g. "Main/" while scanning the directory of Main's subpages
function importPageDirectory( $dir, $prefix = "" )
{
	echo "\n<!-- Checking page directory " . xmlCommentSafe( $dir ) . " -->\n";

	$handle = opendir( $dir );
	if( $handle === false ) {
		# Unreadable directory: note it in the output and carry on with the rest.
		echo "<!-- Could not open directory '" . xmlCommentSafe( $dir ) . "'. Skipping. -->\n";
		return;
	}

	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the loop early.
	while( ( $entry = readdir( $handle ) ) !== false ) {
		if( $entry == '.' || $entry == '..' ) {
			continue;
		}

		$m = array();
		if( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			echo importPage( $prefix . $m[1] );
		} elseif( is_dir( "$dir/$entry" ) ) {
			# Keep the prefix gathered so far, so that pages nested more
			# than one level deep still get their full title.
			importPageDirectory( "$dir/$entry", $prefix . "$entry/" );
		} else {
			echo "<!-- File '" . xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
		}
	}

	closedir( $handle );
}
00107
00108
00109 # ------------------------------------------------------------------------------
00110
00111
00112
00113
00114
# Map a page title to its path below the "page"/"keep" trees, without the
# file extension: "<BUCKET>/<title>", where the bucket is the upper-cased
# first byte of the title when that byte is an ASCII letter, and "other"
# for everything else (including an empty title).
function useModFilename( $title ) {
	$initial = substr( $title, 0, 1 );
	$bucket = preg_match( '/[A-Z]/i', $initial ) ? strtoupper( $initial ) : 'other';
	return "$bucket/$title";
}
00122
# Load the current revision of a page from its ".db" file.
#
# The file is decoded in three nested steps with splitHash(): the page
# record ($FS1), its "text_default" section ($FS2) and that section's
# "data" field ($FS3).
#
# Returns an object with the properties text, summary, minor, ts,
# username and host. If the file does not exist, a message is printed
# and the whole script is terminated.
function fetchPage( $title )
{
	global $FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = "$wgRootDirectory/page/" . useModFilename( $title ) . ".db";
	if( !file_exists( $fname ) ) {
		echo "Couldn't open file '$fname' for page '$title'.\n";
		exit( -1 );
	}

	$record = splitHash( $FS1, file_get_contents( $fname ) );
	$section = splitHash( $FS2, $record["text_default"] );
	$text = splitHash( $FS3, $section["data"] );

	# The first three properties come from the text record, the rest from
	# the section that wraps it.
	$fields = array();
	foreach( array( "text", "summary", "minor" ) as $key ) {
		$fields[$key] = $text[$key];
	}
	foreach( array( "ts", "username", "host" ) as $key ) {
		$fields[$key] = $section[$key];
	}
	return array2object( $fields );
}
00141
# Load the old revisions of a page from its ".kp" file.
#
# Returns a list of objects shaped like the one from fetchPage() (text,
# summary, minor, ts, username, host); the list is empty when the file
# does not exist. A revision is kept only if it has text, a non-empty
# "minor" field and a timestamp greater than zero; each rejected one is
# reported with an XML comment on standard output.
function fetchKeptPages( $title )
{
	global $FS1,$FS2,$FS3, $wgRootDirectory;

	$fname = "$wgRootDirectory/keep/" . useModFilename( $title ) . ".kp";
	if( !file_exists( $fname ) ) {
		return array();
	}

	$chunks = explode( $FS1, file_get_contents( $fname ) );
	array_shift( $chunks ); # Drop the junk at beginning of file

	$revisions = array();
	foreach( $chunks as $chunk ) {
		$section = splitHash( $FS2, $chunk );
		$text = splitHash( $FS3, $section["data"] );

		$usable = $text["text"] && $text["minor"] != "" && ( $section["ts"]*1 > 0 );
		if( !$usable ) {
			echo "<!-- skipped a bad old revision -->\n";
			continue;
		}

		$revisions[] = array2object( array(
			"text" => $text["text"],
			"summary" => $text["summary"],
			"minor" => $text["minor"],
			"ts" => $section["ts"],
			"username" => $section["username"],
			"host" => $section["host"] ) );
	}
	return $revisions;
}
00166
# Turn a flat "key SEP value SEP key SEP value ..." string into an
# associative array. A trailing key without a value is dropped, and a key
# that occurs more than once keeps its last value.
function splitHash ( $sep , $str ) {
	$fields = explode( $sep, $str );
	$total = count( $fields );

	$pairs = array();
	# $k always points at a value; its key is the field right before it.
	for( $k = 1; $k < $total; $k += 2 ) {
		$pairs[$fields[$k - 1]] = $fields[$k];
	}
	return $pairs;
}
00175
00176
00177
00178
00179
00180
# Resolve the author of a revision.
#
# $name - account name as stored in the source data (may be empty)
# $host - host recorded for the edit; used as the name when $name is empty
#
# Returns array( $userid, $username ). The id is read from the global
# $usercache, which is keyed by account name, and is 0 when the name is
# not cached or when there is no name at all. Underscores in the account
# name are turned into spaces.
function checkUserCache( $name, $host )
{
	global $usercache;

	if( !$name ) {
		return array( 0, $host );
	}

	# Look the name up as a key: in_array() would search the cached ids
	# (the values) and therefore never find a name.
	if( isset( $usercache[$name] ) ) {
		$userid = $usercache[$name];
	} else {
		# If we haven't imported user accounts
		$userid = 0;
	}
	return array( $userid, str_replace( '_', ' ', $name ) );
}
00199
# Build the <page> element for one page.
#
# The revision list starts with the current revision from fetchPage().
# When mungeFormat() changes its text, a synthetic "link fix" revision by
# "Conversion script" (stamped with the current time) follows it. The old
# revisions from fetchKeptPages() come last.
#
# An XML comment naming the page is printed immediately; the page XML
# itself is returned as a string for the caller to output.
function importPage( $title )
{
	echo "\n<!-- Importing page " . xmlCommentSafe( $title ) . " -->\n";
	$page = fetchPage( $title );

	$newtitle = xmlsafe( str_replace( '_', ' ', recodeText( $title ) ) );

	$revisions = array( $page );
	$munged = mungeFormat( $page->text );
	if( $munged != $page->text ) {
		$revisions[] = array2object( array(
			'text' => $munged,
			'minor' => 1,
			'username' => 'Conversion script',
			'host' => '127.0.0.1',
			'ts' => time(),
			'summary' => 'link fix',
			) );
	}

	# History. The list can never be empty here because it always holds
	# at least the current revision.
	$revisions = array_merge( $revisions, fetchKeptPages( $title ) );

	$xml = <<<END
<page>
<title>$newtitle</title>

END;

	foreach( $revisions as $rev ) {
		$text = xmlsafe( recodeText( $rev->text ) );
		$minor = ($rev->minor ? '<minor/>' : '');
		# Only the display name is needed; the numeric id is not exported.
		list( , $username ) = checkUserCache( $rev->username, $rev->host );
		$username = xmlsafe( recodeText( $username ) );
		$timestamp = xmlsafe( timestamp2ISO8601( $rev->ts ) );
		$comment = xmlsafe( recodeText( $rev->summary ) );

		$xml .= <<<END
<revision>
<timestamp>$timestamp</timestamp>
<contributor><username>$username</username></contributor>
$minor
<comment>$comment</comment>
<text>$text</text>
</revision>

END;
	}
	$xml .= "</page>\n\n";
	return $xml;
}
00264
00265 # Whee!
# Convert a string from the source wiki's encoding to UTF-8.
#
# Steps: normalise CRLF line ends to LF, convert from $wgImportEncoding
# with iconv(), then turn numeric character references into UTF-8 with
# wfMungeToUtf8().
function recodeText( $string ) {
	global $wgImportEncoding;

	$string = str_replace( "\r\n", "\n", $string );

	# iconv() returns false when it meets a byte it cannot convert. Using
	# that result would silently replace the whole text with nothing, so
	# fall back to the unconverted bytes instead.
	# NOTE(review): this relies on the later UtfNormal::cleanUp() call in
	# xmlsafe() to deal with any invalid sequences left over -- confirm.
	$converted = @iconv( $wgImportEncoding, "UTF-8", $string );
	if( $converted !== false ) {
		$string = $converted;
	}

	return wfMungeToUtf8( $string );
}
00274
# Encode one Unicode code point as a UTF-8 byte sequence.
#
# $codepoint - code point number
#
# Returns the 1 to 4 byte encoding for U+0000..U+10FFFF. Values that are
# not Unicode scalar values (the surrogate range U+D800..U+DFFF and
# anything above U+10FFFF) have no valid UTF-8 form and are returned as a
# decimal character reference "&#N;" instead.
function wfUtf8Sequence($codepoint) {
	if($codepoint < 0x80) return chr($codepoint);

	if($codepoint < 0x800) return chr(($codepoint >> 6 & 0x3f) | 0xc0) .
		chr(($codepoint & 0x3f) | 0x80);

	# Surrogates must not be encoded on their own.
	if($codepoint >= 0xd800 && $codepoint <= 0xdfff) return "&#$codepoint;";

	if($codepoint < 0x10000) return chr(($codepoint >> 12 & 0x0f) | 0xe0) .
		chr(($codepoint >> 6 & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);

	# The supplementary planes end at U+10FFFF, not U+FFFFF.
	if($codepoint < 0x110000) return chr(($codepoint >> 18 & 0x07) | 0xf0) .
		chr(($codepoint >> 12 & 0x3f) | 0x80) .
		chr(($codepoint >> 6 & 0x3f) | 0x80) .
		chr(($codepoint & 0x3f) | 0x80);

	# Beyond the Unicode range
	return "&#$codepoint;";
}
00289
# Replace decimal ("&#1234;") and hexadecimal ("&#x4d2;") character
# references in $string with the UTF-8 text produced by wfUtf8Sequence().
#
# Callbacks are used instead of the "e" pattern modifier, which newer PHP
# versions no longer support and which evaluated the captured digits as
# PHP source (so a decimal reference with leading zeros was read as octal).
function wfMungeToUtf8($string) {
	$string = preg_replace_callback( '/&#([0-9]+);/', 'wfMungeDecimalEntity', $string );
	$string = preg_replace_callback( '/&#x([0-9a-f]+);/i', 'wfMungeHexEntity', $string );
	# Should also do named entities here
	return $string;
}

# preg_replace_callback() helper for wfMungeToUtf8(): decimal references.
function wfMungeDecimalEntity( $matches ) {
	$digits = ltrim( $matches[1], '0' );
	# More than 7 significant digits is far beyond U+10FFFF (1114111);
	# leave such a reference exactly as it was written.
	if( strlen( $digits ) > 7 ) {
		return $matches[0];
	}
	return wfUtf8Sequence( (int)$digits );
}

# preg_replace_callback() helper for wfMungeToUtf8(): hexadecimal references.
function wfMungeHexEntity( $matches ) {
	$digits = ltrim( $matches[1], '0' );
	# More than 6 significant hex digits is far beyond U+10FFFF;
	# leave such a reference exactly as it was written.
	if( strlen( $digits ) > 6 ) {
		return $matches[0];
	}
	return wfUtf8Sequence( (int)hexdec( $digits ) );
}
00296
# Format a Unix timestamp as a UTC date-time string of the form
# 2003-08-05T18:30:02Z.
function timestamp2ISO8601( $ts ) {
	# "T" and "Z" are escaped so gmdate() copies them through literally.
	return gmdate( 'Y-m-d\TH:i:s\Z', $ts );
}
00301
# Prepare a string for use as XML character data: pass it through
# UtfNormal::cleanUp() first, then escape the markup characters with
# htmlspecialchars().
function xmlsafe( $string ) {
	return htmlspecialchars( UtfNormal::cleanUp( $string ) );
}
00313
# Prepare text for use inside an XML comment: recode it, escape it with
# xmlsafe(), and break up every run of two or more dashes, because "--"
# is not allowed inside a comment.
#
# Each dash that touches another dash gets a backslash in front of it, so
# "--" becomes "\-\-" and "---" becomes "\-\-\-"; single dashes stay as
# they are. (Replacing the pair "--" as a unit left the third dash of
# "---" unescaped, directly after an escaped one, which put "--" back
# into the output.)
function xmlCommentSafe( $text ) {
	return preg_replace( '/-(?=-)|(?<=-)-/', '\\\\-', xmlsafe( recodeText( $text ) ) );
}
00317
00318
# Build a plain object whose properties mirror the keys and values of
# $arr, in the same order.
function array2object( $arr ) {
	# Start from an empty object: casting a scalar, as in (object)0, would
	# add an unwanted "scalar" property holding that value.
	$o = new stdClass();
	foreach( $arr as $x => $y ) {
		$o->$x = $y;
	}
	return $o;
}
00326
00327
# Wrap bare CamelCase words (and "/Subpage" style links) in [[...]].
#
# Text that must not be touched -- <nowiki> sections, http/https/ftp
# URLs and existing [[links]] -- is first swapped for a placeholder and
# remembered in the global $nowiki list, then put back in the same order
# once the linking pass is done.
function mungeFormat( $text ) {
	global $nowiki;
	$nowiki = array();
	$staged = preg_replace_callback(
		'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
		'nowikiPlaceholder', $text );

	# This is probably not 100% correct, I'm just
	# glancing at the UseModWiki code.
	$upper = "[A-Z]";
	$lower = "[a-z_0-9]";
	$any = "[A-Za-z_0-9]";
	$camel = "(?:$upper+$lower+$upper+$any*)";
	$subpage = "(?:\\/$any+)";
	$substart = "(?:\\/$upper$any*)";

	$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
		'[[$1]]', $staged );

	# Put the protected fragments back, first placeholder first. This is
	# done by splitting on the placeholder rather than with the "e"
	# pattern modifier, which newer PHP versions no longer support.
	$pieces = explode( placeholder(), $munged );
	$final = array_shift( $pieces );
	foreach( $pieces as $piece ) {
		$final .= array_shift( $nowiki ) . $piece;
	}
	return $final;
}
00354
00355
# Marker text that stands in for a protected fragment while mungeFormat()
# rewrites links. The argument is accepted but not used.
function placeholder( $x = null ) {
	# Single quotes: the backslash sequences are kept as literal
	# characters, they are not turned into 0xff bytes.
	$marker = '\xffplaceholder\xff';
	return $marker;
}
00359
# preg_replace_callback() handler used by mungeFormat(): appends the
# fragment captured by group 1 to the global $nowiki list and returns the
# placeholder text to put in its place.
function nowikiPlaceholder( $matches ) {
	$GLOBALS['nowiki'][] = $matches[1];
	return placeholder();
}
00365
00366