用 Python 做文字处理,把 LaTeX 文件中的
中文 English 中文 -> 中文~ English 中文
使得最终排版效果中,中文后面带有空格。
要求
1. python 2.x
2. tex 文件为 gbk 编码
# -*- coding: gbk -*-
import sys
import os
import subprocess
# Insert '~' between Chinese text and the English (ASCII) text that follows it
def notASCII(c):
    """Return True if the single character *c* lies outside the ASCII range.

    c -- a one-character string; ``ord`` raises TypeError for any other
         length.
    """
    return ord(c) > 127
def isASCII(c):
    """Return True if the single character *c* is an ASCII character.

    c -- a one-character string; ``ord`` raises TypeError for any other
         length.
    """
    return ord(c) <= 127
def inNotTildeSet(c):
    """Return True if no '~' should be inserted in front of character *c*.

    The "no tilde" set consists of every non-ASCII character plus the
    ASCII characters ``~``, ``}``, ``)``, ``'`` and ``"``.

    c -- a one-character string; ``ord`` raises TypeError for any other
         length.
    """
    # ord(c) > 127 is the same test notASCII() performs; it is evaluated
    # first so that non-ASCII input never reaches the ASCII comparison.
    return ord(c) > 127 or c in ('~', '}', ')', '\'', '"')
# Chinese / full-width punctuation marks.  The main loop treats a character
# found in this table as punctuation and therefore never starts a
# "Chinese followed by ASCII" run (and never inserts '~') after it.
#
# The entries are unicode escapes so the table does not depend on the
# encoding the source file is saved in.  The comma, colon, semicolon,
# question mark and parentheses are the FULL-WIDTH forms: the table is only
# consulted for non-ASCII characters, so ASCII look-alikes could never match.
chinesePunctuation = [
    u'\u3002',  # ideographic full stop
    u'\uff0c',  # fullwidth comma
    u'\u3001',  # ideographic comma
    u'\uff1a',  # fullwidth colon
    u'\uff1b',  # fullwidth semicolon
    u'\u201c',  # left double quotation mark
    u'\u201d',  # right double quotation mark
    u'\u2018',  # left single quotation mark
    u'\u2019',  # right single quotation mark
    u'\uff1f',  # fullwidth question mark
    u'\uff08',  # fullwidth left parenthesis
    u'\uff09',  # fullwidth right parenthesis
    u'\u300e',  # left white corner bracket
    u'\u300f',  # right white corner bracket
    u'\u300a',  # left double angle bracket
    u'\u300b',  # right double angle bracket
]
def isChinesePunctuation(c, punctuation=None):
    """Return True if character *c* is one of the known punctuation marks.

    c           -- the character to look up.
    punctuation -- optional collection of marks to test against; when
                   omitted, the module-level ``chinesePunctuation`` table
                   is used.
    """
    if punctuation is None:
        punctuation = chinesePunctuation
    # Membership uses the same == comparison as an explicit search loop.
    return c in punctuation
tex = r""  # path of the original (GBK-encoded) .tex file
tex2 = r""  # path of the output file


def _insertTildes(text):
    """Return *text* with '~' inserted after Chinese-to-ASCII transitions.

    A run starts at a non-ASCII character that is not Chinese punctuation
    and is directly followed by an ASCII character.  The character and any
    spaces after it are copied unchanged; a '~' is then added unless the
    first non-space character is in the "no tilde" set (see
    ``inNotTildeSet``).  Everything else is copied through untouched.

    text -- the decoded (unicode) contents of the .tex file.
    """
    pieces = []
    length = len(text)
    i = 0
    while i < length:
        ch = text[i]
        # Chinese, not Chinese punctuation, and the next character is not
        # Chinese.  The bounds check keeps a Chinese character at the very
        # end of the input from indexing past the end.
        startsRun = (notASCII(ch)
                     and not isChinesePunctuation(ch)
                     and i + 1 < length
                     and isASCII(text[i + 1]))
        if not startsRun:
            pieces.append(ch)
            i += 1
            continue

        # Advance j to the first non-space character (or to the end of the
        # input when only spaces remain).
        j = i + 1
        while j < length and text[j] == ' ':
            j += 1

        # Emit characters i .. j-1: the Chinese character and the spaces.
        pieces.append(text[i:j])

        # Add '~' only when a following character exists and it is not in
        # the "do not add ~" set.
        if j < length and not inNotTildeSet(text[j]):
            pieces.append(u'~')
        i = j
    return u''.join(pieces)


def _convertFile(srcPath, dstPath):
    """Read the GBK file *srcPath*, insert tildes, write GBK to *dstPath*."""
    # io.open (Python 2.6+) decodes/encodes at the file boundary and applies
    # universal-newline handling in text mode; imported here so this block
    # does not rely on a new file-level import.
    import io

    with io.open(srcPath, 'r', encoding='gbk') as src:
        text = src.read()
    converted = _insertTildes(text)
    with io.open(dstPath, 'w', encoding='gbk') as dst:
        dst.write(converted)


if __name__ == '__main__':
    # Paths given on the command line take precedence over the hard-coded
    # defaults above:  python script.py input.tex output.tex
    if len(sys.argv) >= 3:
        tex, tex2 = sys.argv[1], sys.argv[2]
    _convertFile(tex, tex2)
0 comments:
Post a Comment