Jun 15, 2011

CTEX 中英文混排加空格

目标:
用 Python 做文本处理,在 LaTeX 文件的中英文交界处插入 `~`,例如:
中文 English 中文 -> 中文~ English 中文
使得最终排版效果中,中文与其后的英文之间留有空格。

要求
1. python 2.x
2. tex 文件为 gbk 编码

# -*- coding: gbk -*- 
import sys
import os
import subprocess

# Insert '~' at the boundary between Chinese text and the ASCII text that follows it.

def notASCII(c):
    """Return True when the code point of character *c* is above 127."""
    codePoint = ord(c)
    return codePoint >= 128
 
def isASCII(c):
    """Return True when the code point of character *c* is 127 or below."""
    codePoint = ord(c)
    return codePoint < 128

def inNotTildeSet(c):
    """Return True when *c* belongs to the "do not add '~'" set.

    The set consists of every non-ASCII character plus the ASCII
    characters ~ } ) ' and ".
    """
    if notASCII(c):
        return True
    return c in ('~', '}', ')', "'", '"')


# Punctuation marks that isChinesePunctuation() recognises.
#
# The values are written as unicode escapes so that they do not depend on
# the encoding this source file is saved in.  The half-width comma, colon,
# semicolon, question mark and parentheses are kept for compatibility, but
# the main loop only tests non-ASCII characters against this list, so it is
# their full-width counterparts (added at the end) that actually matter.
chinesePunctuation = [
    u'\u3002',  # ideographic full stop
    u',',       # half-width comma
    u'\u3001',  # ideographic (enumeration) comma
    u':',       # half-width colon
    u';',       # half-width semicolon
    u'\u201c',  # left double quotation mark
    u'\u201d',  # right double quotation mark
    u'\u2018',  # left single quotation mark
    u'\u2019',  # right single quotation mark
    u'?',       # half-width question mark
    u'(',       # half-width left parenthesis
    u')',       # half-width right parenthesis
    u'\u300e',  # left white corner bracket
    u'\u300f',  # right white corner bracket
    u'\u300a',  # left double angle bracket
    u'\u300b',  # right double angle bracket
    u'\uff0c',  # full-width comma
    u'\uff1a',  # full-width colon
    u'\uff1b',  # full-width semicolon
    u'\uff1f',  # full-width question mark
    u'\uff08',  # full-width left parenthesis
    u'\uff09',  # full-width right parenthesis
]


def isChinesePunctuation(c):
    """Return True when *c* equals one of the entries of chinesePunctuation."""
    return any(c == mark for mark in chinesePunctuation)
 


tex = r""   # path of the original .tex file (read and decoded as GBK)
tex2 = r""  # path of the output file (written as GBK)


def addTildes(text):
    """Return *text* with '~' inserted where Chinese text meets ASCII text.

    The text is scanned left to right.  When a non-ASCII character that is
    not Chinese punctuation is directly followed by an ASCII character, that
    character and any spaces following it are copied, and a '~' is then
    added unless the next non-space character is in the "do not add '~'"
    set (see inNotTildeSet).  Every other character is copied unchanged.

    Reaching the end of the text never raises: a trailing Chinese character,
    or one followed only by spaces, is copied as is without a '~'.

    text -- the decoded (unicode) contents of the .tex file.
    """
    pieces = []
    length = len(text)
    i = 0
    while i < length:
        c = text[i]
        # Chinese, not Chinese punctuation, and the next character exists
        # and is not Chinese.
        if (notASCII(c) and not isChinesePunctuation(c)
                and i + 1 < length and isASCII(text[i + 1])):
            # Advance j to the first non-space character (or end of text).
            j = i + 1
            while j < length and text[j] == ' ':
                j += 1
            # Copy characters i .. j-1: the Chinese character and the spaces.
            pieces.append(text[i:j])
            # Add '~' only when there is a following character and it is
            # not in the "do not add '~'" set.
            if j < length and not inNotTildeSet(text[j]):
                pieces.append(u'~')
            i = j
        else:
            pieces.append(c)
            i += 1
    return u''.join(pieces)


def main():
    """Read the file named by tex, insert the tildes, write it to tex2."""
    # 'rU' keeps the original universal-newline reading of the input.
    with open(tex, 'rU') as source:
        text = source.read().decode('gbk')

    # Convert and encode before opening the output file, so that a failure
    # cannot leave a truncated or half-written result behind.
    converted = addTildes(text).encode('gbk')

    with open(tex2, 'w') as of:
        of.write(converted)


if __name__ == '__main__':
    main()

0 comments: