00001
00002
00003 import tarfile, zipfile
00004 import os, re, shutil, urllib.request
00005
00006
# SourceForge mirror name; main() substitutes it into the host part of the
# scim-tables, scim-pinyin and libtabe download URLs.
SF_MIRROR = 'easynews'
# Release versions; main() uses them to build both the download URLs and the
# local archive file names (and, for the SCIM packages, the member paths).
SCIM_TABLES_VER = '0.5.9'
SCIM_PINYIN_VER = '0.5.91'
LIBTABE_VER = '0.2.3'
00011
00012
def GetFileFromURL(url, dest):
    """Fetch *url* into the local file *dest* unless *dest* already exists.

    An existing regular file at *dest* is reported as up to date and is not
    downloaded again. Returns None in both cases.
    """
    if not os.path.isfile(dest):
        print('Downloading from [%s] ...' % url)
        urllib.request.urlretrieve(url, dest)
        print('Download complete.\n')
    else:
        print('File %s up to date.' % dest)
00021
def GetFileFromZip(path):
    """Extract every member of the zip archive *path* into the current directory.

    Returns None.
    """
    print('Extracting files from %s ...' % path)
    # The context manager closes the archive handle even if extraction fails;
    # the bare ZipFile(path).extractall() left it to the garbage collector.
    with zipfile.ZipFile(path) as archive:
        archive.extractall()
00026
def GetFileFromTar(path, member, rename):
    """Pull one file out of a gzip-compressed tar archive.

    *member* is extracted relative to the current directory, moved to
    *rename*, and the top-level directory that the extraction created (the
    first ``/``-separated component of *member*) is then deleted.

    Returns None.
    """
    print('Extracting %s from %s ...' % (rename, path))
    # Close the archive deterministically instead of leaking the handle.
    with tarfile.open(path, 'r:gz') as archive:
        archive.extract(member)
    shutil.move(member, rename)
    tree_rmv = member.split('/')[0]
    # A member without a directory component leaves nothing behind to clean
    # up (the file itself was just moved), so only remove real directories.
    if os.path.isdir(tree_rmv):
        shutil.rmtree(tree_rmv)
00034
def ReadBIG5File(dest):
    """Decode *dest* from Big5-HKSCS, rewrite it in place as UTF-8, return the text.

    Undecodable byte sequences are decoded as U+FFFD (``errors='replace'``)
    and each U+FFFD is then turned into a newline before the file is
    rewritten.
    """
    print('Reading and decoding %s ...' % dest)
    # ``with`` guarantees both handles are closed even if read/write raises.
    with open(dest, 'r', encoding='big5hkscs', errors='replace') as src:
        text = src.read()
    text = text.replace('\ufffd', '\n')
    with open(dest, 'w', encoding='utf8') as out:
        out.write(text)
    return text
00045
def ReadFile(dest):
    """Return the entire contents of the UTF-8 text file *dest*."""
    print('Reading and decoding %s ...' % dest)
    # ``with`` closes the handle even when read() raises (e.g. decode error).
    with open(dest, 'r', encoding='utf8') as f:
        return f.read()
00052
def ReadUnihanFile(dest):
    """Collect the variant mappings found in the UTF-8 text file *dest*.

    Lines starting with ``#`` are skipped. A line containing
    ``kSimplifiedVariant`` (checked first) or ``kTraditionalVariant`` is split
    on that tag; the stripped text before and after the tag is stored as a
    ``(left, right)`` tuple. All other lines are ignored.

    Returns ``(t2s_code, s2t_code)``: the list of tuples from
    ``kSimplifiedVariant`` lines and the list from ``kTraditionalVariant``
    lines, each in file order.
    """
    print('Reading and decoding %s ...' % dest)
    t2s_code = []
    s2t_code = []
    # Iterate the file directly; ``with`` closes it even if parsing fails.
    with open(dest, 'r', encoding='utf8') as f:
        for line in f:
            if line.startswith('#'):
                continue
            if 'kSimplifiedVariant' in line:
                temp = line.split('kSimplifiedVariant')
                t2s_code.append((temp[0].strip(), temp[1].strip()))
            elif 'kTraditionalVariant' in line:
                temp = line.split('kTraditionalVariant')
                s2t_code.append((temp[0].strip(), temp[1].strip()))
    return (t2s_code, s2t_code)
00073
def RemoveRows(text, num):
    """Delete the first *num* rows of *text* and return the remainder.

    A "row" is one match of ``.*\\s*``: the rest of the current line plus all
    whitespace that follows it, so blank lines after a row are removed along
    with it. As with ``re.sub``, ``num == 0`` means "no limit".
    """
    # Raw string: ``\s`` is an invalid escape in a normal literal.
    # ``count=`` keyword: passing it positionally is deprecated since 3.13.
    return re.sub(r'.*\s*', '', text, count=num)
00077
def RemoveOneCharConv(text):
    """Blank out every line of *text* that consists of a single character.

    Matches, per line, exactly one character followed only by whitespace up
    to a line end, and replaces the match with the empty string.
    """
    # Raw string: ``\s`` is an invalid escape in a normal literal.
    preg = re.compile(r'^.\s*$', re.MULTILINE)
    return preg.sub('', text)
00082
def ConvertToChar(code):
    """Turn a code string such as ``'U+4E2D'`` into the character it names.

    Anything from the first ``<`` onward is discarded, the first two
    characters are skipped, and the rest is parsed as a hexadecimal code point.
    """
    codepoint, _, _ = code.partition('<')
    hex_digits = codepoint[2:]
    return chr(int(hex_digits, 16))
00086
def GetDefaultTable(code_table):
    """Build a ``{char: [char, ...]}`` table from ``(from_code, to_codes)`` pairs.

    Pairs with an empty side are skipped. ``to_codes`` is split on whitespace
    and every code (including ``from_code``) is decoded with ConvertToChar.
    A later pair with the same source character replaces the earlier one.
    """
    return {
        ConvertToChar(source): [ConvertToChar(target) for target in targets.split()]
        for (source, targets) in code_table
        if source and targets
    }
00095
def GetManualTable(dest):
    """Parse the manual table file *dest* into ``{char: [char, ...]}``.

    The file is split on whitespace. Each token has its leading/trailing
    ``|`` removed and is then split at the first ``|`` into a source code and
    a ``|``-separated list of target codes. For every code, characters 2..6
    are parsed as hexadecimal to obtain the code point.
    """
    def to_char(field):
        # Same slice as the source data expects: skip 2 chars, take up to 5.
        return chr(int(field[2:7], 16))

    char_table = {}
    for token in ReadFile(dest).split():
        entry = token.strip('|')
        if not entry:
            continue
        parts = entry.split('|', 1)
        from_char = to_char(parts[0])
        char_table[from_char] = [to_char(code) for code in parts[1].split('|')]
    return char_table
00108
def GetValidTable(src_table):
    """Reduce a one-to-many table to one-to-one by keeping each first target."""
    return {source: targets[0] for source, targets in src_table.items()}
00114
def GetToManyRules(src_table):
    """Return ``{target: True}`` for every non-first target in *src_table*.

    These are the targets that GetValidTable drops when it keeps only the
    first entry of each list.
    """
    tomany_table = {}
    for targets in src_table.values():
        for extra in targets[1:]:
            tomany_table[extra] = True
    return tomany_table
00121
def RemoveRules(dest, table):
    """Delete from *table* the rules listed in the file *dest*; return *table*.

    The file is split on whitespace and quotes are stripped from each token.
    Token forms:

    * ``=>t``  - remove every rule whose value equals ``t``;
    * ``f=>``  - remove the rule keyed ``f``;
    * ``f=>t`` - remove the rule keyed ``f``;
    * ``x``    - remove the rule keyed ``x`` and every rule whose value is ``x``.

    *table* is modified in place. Keys that are absent are ignored.
    """
    text = ReadFile(dest)
    for elem in text.split():
        elem = elem.strip().replace('"', '').replace('\'', '')
        f = t = ''
        if '=>' not in elem:
            f = t = elem
        elif elem.startswith('=>'):
            t = elem.replace('=>', '').strip()
        elif elem.endswith('=>'):
            f = elem.replace('=>', '').strip()
        else:
            # NOTE(review): the original called table.pop(f, t), which drops
            # key f whatever it maps to (t was only pop's default value).
            # That behaviour is kept; confirm whether "only when table[f] == t"
            # was the intent.
            f = elem.split('=>')[0].strip()
            table.pop(f, None)
            continue
        if f:
            # pop with a default replaces the former bare ``except:`` guard.
            table.pop(f, None)
        if t:
            # Collect matching keys first so the dict is not mutated while
            # it is being iterated.
            for key in [k for k, v in table.items() if v == t]:
                del table[key]
    return table
00155
def DictToSortedList1(src_table):
    """Return the ``(key, value)`` pairs of *src_table* sorted by key."""
    pairs = list(src_table.items())
    pairs.sort(key=lambda pair: pair[0])
    return pairs
00158
def DictToSortedList2(src_table):
    """Return the ``(key, value)`` pairs of *src_table* sorted by value.

    The sort is stable, so pairs with equal values keep the dict's order.
    """
    pairs = list(src_table.items())
    pairs.sort(key=lambda pair: pair[1])
    return pairs
00161
def Converter(string, conv_table):
    """Rewrite *string* by longest-match lookup in *conv_table*.

    At each position the longest substring starting there that has a truthy
    entry in *conv_table* is replaced by that entry, and scanning resumes
    right after the inserted text. Where nothing matches, the position
    advances by one character.
    """
    pos = 0
    while pos < len(string):
        # Try the longest candidate first, shrinking down to one character.
        for width in range(len(string) - pos, 0, -1):
            replacement = conv_table.get(string[pos:pos + width])
            if not replacement:
                continue
            string = string[:pos] + replacement + string[pos + width:]
            # Land on the last inserted char; the += 1 below steps past it.
            pos += len(replacement) - 1
            break
        pos += 1
    return string
00174
def GetDefaultWordsTable(src_wordlist, src_tomany, char_conv_table, char_reconv_table):
    """Derive word-level reverse-conversion rules from a word list.

    Words are de-duplicated and processed in groups of equal length, shortest
    first. For each group the lookup tables are rebuilt from the word rules
    found so far, overlaid with the given character tables (which win on key
    clashes). A word ``w`` with ``new = Converter(w, conv_table)`` yields the
    rule ``new -> w`` when ``new`` has no truthy entry in the reverse table
    and either

    * ``Converter(w, reconv_table) != w``, or
    * some character of ``w`` has a truthy entry in *src_tomany* and
      ``Converter(new, reconv_table) != w``.

    Returns the accumulated ``{new_word: word}`` dict.
    """
    # Sorting on (length, word) makes the order inside a length group
    # reproducible; with key=len alone it followed set iteration order.
    wordlist = sorted(set(src_wordlist), key=lambda w: (len(w), w), reverse=True)
    word_conv_table = {}
    word_reconv_table = {}
    while wordlist:
        # Snapshot of the tables for this length group; rules found inside
        # the group only become visible to the next (longer) group.
        conv_table = dict(word_conv_table)
        conv_table.update(char_conv_table)
        reconv_table = dict(word_reconv_table)
        reconv_table.update(char_reconv_table)
        word_len = len(wordlist[-1])
        # Peek before popping: the previous version popped the first word of
        # the next length, left the inner loop, and then popped again in the
        # outer loop, silently dropping one word per length boundary.
        while wordlist and len(wordlist[-1]) == word_len:
            word = wordlist.pop()
            test_word = Converter(word, reconv_table)
            new_word = Converter(word, conv_table)
            if reconv_table.get(new_word):
                continue
            if test_word != word:
                accept = True
            elif any(src_tomany.get(char) for char in word):
                accept = Converter(new_word, reconv_table) != word
            else:
                accept = False
            if accept:
                word_conv_table[word] = new_word
                word_reconv_table[new_word] = word
    return word_reconv_table
00210
def GetManualWordsTable(src_wordlist, conv_table):
    """Build ``{converted_word: word}`` from manually listed words.

    Each entry of *src_wordlist* is cut at the first ``#`` (trailing comment)
    and stripped. Entries left empty by this (blank lines and comment-only
    lines) are skipped: they used to produce a meaningless ``'' -> ''`` rule
    that ended up in the generated table. Remaining words are processed from
    shortest to longest, so for two words that convert to the same text the
    later (longer, then lexicographically greater) one wins.
    """
    words = {item.split('#')[0].strip() for item in src_wordlist}
    words.discard('')
    reconv_table = {}
    # (length, word) ordering keeps the result independent of set ordering.
    for word in sorted(words, key=lambda w: (len(w), w)):
        reconv_table[Converter(word, conv_table)] = word
    return reconv_table
00221
def CustomRules(dest):
    """Read whitespace-separated ``from to`` token pairs from *dest* into a dict.

    Tokens are paired in order (1st with 2nd, 3rd with 4th, ...). An odd
    number of tokens raises IndexError on the unpaired last token.
    """
    tokens = ReadFile(dest).split()
    rules = {}
    for pos in range(0, len(tokens), 2):
        rules[tokens[pos]] = tokens[pos + 1]
    return rules
00227
def GetPHPArray(table):
    """Render ``(from, to)`` pairs as the body lines of a PHP array literal.

    Each pair becomes ``'from' => 'to',`` on its own line; lines are joined
    with ``\\n`` (no trailing newline). Backslashes and single quotes are
    escaped so the result is always a valid PHP single-quoted string, and
    pairs with an empty ``from`` are omitted because an empty key can never
    match any input.
    """
    def quote(text):
        # Inside PHP single quotes only ``\\`` and ``\'`` are escapes;
        # backslashes must be doubled first so the quote escape survives.
        return text.replace('\\', '\\\\').replace('\'', '\\\'')

    lines = ['\'%s\' => \'%s\',' % (quote(f), quote(t)) for (f, t) in table if f]
    return '\n'.join(lines)
00232
def RemoveSameChar(src_table):
    """Return a copy of *src_table* without entries whose key equals its value."""
    return {
        source: target
        for source, target in src_table.items()
        if source != target
    }
00239
def main():
    """Generate ``ZhConversion.php`` in the current directory.

    Steps, in order:

    1. Download the Unihan, scim-tables, scim-pinyin and libtabe archives
       (each is skipped when the file already exists locally).
    2. Extract word lists from them into ``t_wordlist`` and ``s_wordlist``
       and drop the lines matched by the ``*phrases_exclude.manual`` files.
    3. Build the character tables from ``Unihan.txt`` plus the ``*.manual``
       override files.
    4. Derive the word-level tables and write all six PHP arrays.

    All input and output paths are relative to the current directory.
    """
    # --- 1. fetch the upstream archives ---------------------------------
    url = 'ftp://ftp.unicode.org/Public/UNIDATA/Unihan.zip'
    han_dest = 'Unihan.zip'
    GetFileFromURL(url, han_dest)

    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-tables-%s.tar.gz' % (SF_MIRROR, SCIM_TABLES_VER)
    tbe_dest = 'scim-tables-%s.tar.gz' % SCIM_TABLES_VER
    GetFileFromURL(url, tbe_dest)

    url = 'http://%s.dl.sourceforge.net/sourceforge/scim/scim-pinyin-%s.tar.gz' % (SF_MIRROR, SCIM_PINYIN_VER)
    pyn_dest = 'scim-pinyin-%s.tar.gz' % SCIM_PINYIN_VER
    GetFileFromURL(url, pyn_dest)

    url = 'http://%s.dl.sourceforge.net/sourceforge/libtabe/libtabe-%s.tgz' % (SF_MIRROR, LIBTABE_VER)
    lbt_dest = 'libtabe-%s.tgz' % LIBTABE_VER
    GetFileFromURL(url, lbt_dest)

    GetFileFromZip(han_dest)

    # --- 2. collect word lists -------------------------------------------
    t_wordlist = []
    s_wordlist = []

    # EZ-Big table: keep only the text after the last tab of each row.
    src = 'scim-tables-%s/tables/zh/EZ-Big.txt.in' % SCIM_TABLES_VER
    dst = 'EZ.txt.in'
    GetFileFromTar(tbe_dest, src, dst)
    text = ReadFile(dst)
    text = text.split('BEGIN_TABLE')[1].strip()
    text = text.split('END_TABLE')[0].strip()
    text = re.sub(r'.*\t', '', text)
    text = RemoveOneCharConv(text)
    t_wordlist.extend(text.split())

    # Wubi table: keep the column between the last two tabs.
    # (Raw strings: ``\d`` and ``\g`` are invalid escapes in normal literals.)
    src = 'scim-tables-%s/tables/zh/Wubi.txt.in' % SCIM_TABLES_VER
    dst = 'Wubi.txt.in'
    GetFileFromTar(tbe_dest, src, dst)
    text = ReadFile(dst)
    text = text.split('BEGIN_TABLE')[1].strip()
    text = text.split('END_TABLE')[0].strip()
    text = re.sub(r'.*\t(.*?)\t\d*', r'\g<1>', text)
    text = RemoveOneCharConv(text)
    s_wordlist.extend(text.split())

    # Ziranma table: same row handling as Wubi.
    src = 'scim-tables-%s/tables/zh/Ziranma.txt.in' % SCIM_TABLES_VER
    dst = 'Ziranma.txt.in'
    GetFileFromTar(tbe_dest, src, dst)
    text = ReadFile(dst)
    text = text.split('BEGIN_TABLE')[1].strip()
    text = text.split('END_TABLE')[0].strip()
    text = re.sub(r'.*\t(.*?)\t\d*', r'\g<1>', text)
    text = RemoveOneCharConv(text)
    s_wordlist.extend(text.split())

    # scim-pinyin phrase library: keep the text before the tab + number,
    # then drop the first five rows.
    src = 'scim-pinyin-%s/data/phrase_lib.txt' % SCIM_PINYIN_VER
    dst = 'phrase_lib.txt'
    GetFileFromTar(pyn_dest, src, dst)
    text = ReadFile('phrase_lib.txt')
    text = re.sub(r'(.*)\t\d\d*.*', r'\g<1>', text)
    text = RemoveRows(text, 5)
    text = RemoveOneCharConv(text)
    s_wordlist.extend(text.split())

    # libtabe tsi.src: Big5 input, re-encoded in place; strip '# ' markers
    # and everything from the first " <digit>" to the end of each row.
    src = 'libtabe/tsi-src/tsi.src'
    dst = 'tsi.src'
    GetFileFromTar(lbt_dest, src, dst)
    text = ReadBIG5File('tsi.src')
    text = re.sub(r' \d.*', '', text.replace('# ', ''))
    text = RemoveOneCharConv(text)
    t_wordlist.extend(text.split())

    # De-duplicate both lists.
    t_wordlist = list(set(t_wordlist))
    s_wordlist = list(set(s_wordlist))

    # Drop every line that contains one of the exclusion tokens. Each token
    # is used as a regular-expression fragment, and the pattern needs a
    # trailing newline, so the final line of the joined text is never removed.
    text = ReadFile('simpphrases_exclude.manual')
    temp = text.split()
    s_string = '\n'.join(s_wordlist)
    for elem in temp:
        s_string = re.sub(r'.*%s.*\n' % elem, '', s_string)
    s_wordlist = s_string.split('\n')

    text = ReadFile('tradphrases_exclude.manual')
    temp = text.split()
    t_string = '\n'.join(t_wordlist)
    for elem in temp:
        t_string = re.sub(r'.*%s.*\n' % elem, '', t_string)
    t_wordlist = t_string.split('\n')

    # --- 3. character tables ---------------------------------------------
    (t2s_code, s2t_code) = ReadUnihanFile('Unihan.txt')

    # Manual entries override the Unihan-derived ones for the same key.
    t2s_1tomany = {}
    t2s_1tomany.update(GetDefaultTable(t2s_code))
    t2s_1tomany.update(GetManualTable('trad2simp.manual'))

    s2t_1tomany = {}
    s2t_1tomany.update(GetDefaultTable(s2t_code))
    s2t_1tomany.update(GetManualTable('simp2trad.manual'))

    t2s_1to1 = GetValidTable(t2s_1tomany)
    s_tomany = GetToManyRules(t2s_1tomany)

    s2t_1to1 = GetValidTable(s2t_1tomany)
    t_tomany = GetToManyRules(s2t_1tomany)

    t2s_1to1 = RemoveRules('trad2simp_noconvert.manual', t2s_1to1)
    s2t_1to1 = RemoveRules('simp2trad_noconvert.manual', s2t_1to1)

    # The *_supp copies carry extra rules that are used for deriving the
    # word tables but are not written to the output themselves.
    t2s_1to1_supp = t2s_1to1.copy()
    s2t_1to1_supp = s2t_1to1.copy()

    t2s_1to1_supp.update(CustomRules('trad2simp_supp_set.manual'))
    s2t_1to1_supp.update(CustomRules('simp2trad_supp_set.manual'))

    # --- 4. word tables ---------------------------------------------------
    text = ReadFile('simpphrases.manual')
    s_wordlist_manual = text.split('\n')
    t2s_word2word_manual = GetManualWordsTable(s_wordlist_manual, s2t_1to1_supp)
    t2s_word2word_manual.update(CustomRules('toSimp.manual'))

    text = ReadFile('tradphrases.manual')
    t_wordlist_manual = text.split('\n')
    s2t_word2word_manual = GetManualWordsTable(t_wordlist_manual, t2s_1to1_supp)
    s2t_word2word_manual.update(CustomRules('toTrad.manual'))

    s2t_supp = s2t_1to1_supp.copy()
    s2t_supp.update(s2t_word2word_manual)
    t2s_supp = t2s_1to1_supp.copy()
    t2s_supp.update(t2s_word2word_manual)

    # Manual word rules take precedence over the derived ones.
    t2s_word2word = GetDefaultWordsTable(s_wordlist, s_tomany, s2t_1to1_supp, t2s_supp)
    t2s_word2word.update(t2s_word2word_manual)

    s2t_word2word = GetDefaultWordsTable(t_wordlist, t_tomany, t2s_1to1_supp, s2t_supp)
    s2t_word2word.update(s2t_word2word_manual)

    # --- 5. assemble and write the PHP file -------------------------------
    t2s_1to1 = RemoveSameChar(t2s_1to1)
    s2t_1to1 = RemoveSameChar(s2t_1to1)
    toHans = DictToSortedList1(t2s_1to1) + DictToSortedList2(t2s_word2word)
    toHant = DictToSortedList1(s2t_1to1) + DictToSortedList2(s2t_word2word)
    toCN = DictToSortedList2(CustomRules('toCN.manual'))
    toHK = DictToSortedList2(CustomRules('toHK.manual'))
    toSG = DictToSortedList2(CustomRules('toSG.manual'))
    toTW = DictToSortedList2(CustomRules('toTW.manual'))

    header = (
        '<?php\n'
        '/**\n'
        ' * Simplified / Traditional Chinese conversion tables\n'
        ' *\n'
        ' * Automatically generated using code and data in includes/zhtable/\n'
        ' * Do not modify directly!\n'
        ' */\n'
        '\n'
    )
    # Order of the arrays in the generated file.
    sections = [
        ('zh2Hant', toHant),
        ('zh2Hans', toHans),
        ('zh2TW', toTW),
        ('zh2HK', toHK),
        ('zh2CN', toCN),
        ('zh2SG', toSG),
    ]
    php = header + '\n);\n\n'.join(
        '$%s = array(\n%s' % (name, GetPHPArray(table))
        for (name, table) in sections
    ) + '\n);'

    # ``with`` closes (and flushes) the output even if the write fails.
    with open('ZhConversion.php', 'w', encoding='utf8') as f:
        print('Writing ZhConversion.php ... ')
        f.write(php)
00436
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()