用 Python 做文字处理:在 LaTeX 文件中的中文与紧随其后的英文之间加入“~”,例如:
中文 English 中文 -> 中文~ English 中文
1. python 2.x
2. tex 文件为 gbk 编码
# -*- coding: gbk -*-
"""Insert '~' between Chinese text and the ASCII text that follows it.

The script reads the GBK-encoded file named by ``tex``, inserts a '~'
after every non-punctuation, non-ASCII character that is followed by ASCII
text (any spaces in between are kept and the '~' goes after them), and
writes the result, GBK-encoded, to the file named by ``tex2``.

The source below is pure ASCII (punctuation is spelled with ``\\u``
escapes) and is written to run on Python 2.x as well as Python 3.
"""
import io
import os
import subprocess
import sys


# TO ADD '~' between Chinese and English text
def notASCII(c):
    """Return True if the single character *c* has a code point above 127."""
    return ord(c) > 127


def isASCII(c):
    """Return True if the single character *c* has a code point of 127 or less."""
    return ord(c) <= 127


def inNotTildeSet(c):
    """Return True if no '~' must be inserted in front of character *c*.

    That is the case for every non-ASCII character and for the ASCII
    characters ``~ } ) ' "``.
    """
    return notASCII(c) or c in ('~', '}', ')', '\'', '"')


# Punctuation after which no '~' is inserted.
# NOTE(review): the ASCII entries (, : ; ? ( )) are kept exactly as they were
# listed, but they can never match in addTildes() because only non-ASCII
# characters are looked up.  They presumably stood for their fullwidth
# counterparts, so those are listed as well -- confirm against real input.
chinesePunctuation = [
    u'\u3002',  # ideographic full stop
    u',',
    u'\uff0c',  # fullwidth comma
    u'\u3001',  # ideographic comma
    u':',
    u'\uff1a',  # fullwidth colon
    u';',
    u'\uff1b',  # fullwidth semicolon
    u'\u201c',  # left double quotation mark
    u'\u201d',  # right double quotation mark
    u'\u2018',  # left single quotation mark
    u'\u2019',  # right single quotation mark
    u'?',
    u'\uff1f',  # fullwidth question mark
    u'(',
    u'\uff08',  # fullwidth left parenthesis
    u')',
    u'\uff09',  # fullwidth right parenthesis
    u'\u300e',  # left white corner bracket
    u'\u300f',  # right white corner bracket
    u'\u300a',  # left double angle bracket
    u'\u300b',  # right double angle bracket
]


def isChinesePunctuation(c):
    """Return True if *c* is one of the entries of ``chinesePunctuation``."""
    return c in chinesePunctuation


def addTildes(text):
    """Return *text* with '~' inserted between Chinese and following ASCII text.

    For every non-ASCII, non-punctuation character that is directly followed
    by an ASCII character, the character and any run of spaces after it are
    copied unchanged; a '~' is then added unless the next character is one
    for which ``inNotTildeSet`` is true.  Everything else is copied as is.

    *text* is expected to be decoded (unicode) text.  Text that ends with a
    Chinese character, or with a Chinese character followed only by spaces,
    is copied through without a '~' instead of running past the end.
    """
    pieces = []
    n = len(text)
    i = 0
    while i < n:
        c = text[i]
        # Guard the look-ahead so the last character of the text is safe.
        followedByASCII = i + 1 < n and isASCII(text[i + 1])
        if notASCII(c) and not isChinesePunctuation(c) and followedByASCII:
            # Advance j to the first non-space character (or the end).
            j = i + 1
            while j < n and text[j] == ' ':
                j += 1
            # Copy the Chinese character together with the skipped spaces.
            pieces.append(text[i:j])
            if j < n and not inNotTildeSet(text[j]):
                pieces.append(u'~')
            i = j
        else:
            pieces.append(c)
            i += 1
    return u''.join(pieces)


tex = r""   # original file
tex2 = r""  # output file


def main():
    """Read ``tex`` (GBK), apply ``addTildes`` and write the result to ``tex2``."""
    if not tex or not tex2:
        sys.exit("Set 'tex' (input path) and 'tex2' (output path) first.")
    # Text mode with the default newline handling reads with universal
    # newlines, like the 'rU' mode used before.
    with io.open(tex, 'r', encoding='gbk') as src:
        text = src.read()
    result = addTildes(text)
    # Encode once at the I/O boundary instead of once per character.
    with io.open(tex2, 'w', encoding='gbk') as dst:
        dst.write(result)


if __name__ == "__main__":
    main()
Post a Comment