Python: Find & Replace Strings in Unicode Files

Advertise Here

, ,

This page shows Python scripts for doing find and replace on Unicode-encoded files.

Here's how to do it for a file encoded in UTF-16.

# -*- coding: utf-8 -*-
# Python

# Find and replace many pairs of strings, in sequence, in a UTF-16 file.
# The result is written to a separate output file; the input is not modified.

filePath = '/Users/t/web/p/x/x001.html'
outFile = filePath + '~-~'

# (find, replace) pairs, applied in list order; each pair operates on the
# text produced by the pairs before it.
findreplace = [
    (u'<title>西游记</title>', u'<title>西游记 (Monkey King)</title>'),
]

# Read raw bytes and decode explicitly, so line endings pass through untouched.
with open(filePath, 'rb') as inF:
    s = inF.read().decode('utf-16')

for findStr, repStr in findreplace:
    s = s.replace(findStr, repStr)

# Write `s` itself: it is defined even when findreplace is empty.
with open(outFile, 'wb') as outF:
    outF.write(s.encode('utf-16'))

This script was used to work on a classic Chinese novel encoded in UTF-16. (The files have since been converted to UTF-8.)

UTF-8 Version

Here's a script that does multi-pair find & replace on all HTML files in a directory and its subdirectories, assuming the encoding is UTF-8.

# -*- coding: utf-8 -*-
# Python

import os,sys,shutil

# Directory whose .html files are processed.
mydir = '/Users/t/web/p/xyz'

# (find, replace) string pairs, applied to each file in list order.
findreplace = [
    ('find1', 'replace1'),
    ('find2', 'replace2'),
]

def replaceStringInFile(filePath, pairs=None):
    """Apply a sequence of find/replace pairs to a UTF-8 file, in place.

    The unmodified file is copied to filePath + '~~' as a backup before the
    edited text takes its place.

    filePath -- path of the UTF-8 encoded file to edit.
    pairs -- optional sequence of (find, replace) string pairs, applied in
        order. When omitted, the module-level ``findreplace`` list is used.

    Raises UnicodeDecodeError if the file is not valid UTF-8, and the usual
    IOError/OSError if a file operation fails.
    """
    if pairs is None:
        pairs = findreplace
    print(filePath)
    tempName = filePath + '~x~'
    backupName = filePath + '~~'

    # Binary mode plus explicit decoding leaves line endings untouched.
    with open(filePath, 'rb') as inF:
        s = inF.read().decode('utf-8')

    # Each pair operates on the text produced by the pairs before it.
    for findStr, repStr in pairs:
        s = s.replace(findStr, repStr)

    # Write `s` itself: it is defined even when there are no pairs.
    with open(tempName, 'wb') as outF:
        outF.write(s.encode('utf-8'))

    shutil.copy2(filePath, backupName)
    # Remove before renaming: os.rename will not overwrite an existing
    # file on Windows.
    os.remove(filePath)
    os.rename(tempName, filePath)

def myfun(dummy, dirr, filess):
    """Directory-walk callback: run replaceStringInFile on each .html file.

    dummy -- unused; present only to fit the three-argument callback shape.
    dirr -- path of the directory being visited.
    filess -- names of the entries inside dirr.
    """
    for child in filess:
        childPath = os.path.join(dirr, child)
        # The extension match is exact and case-sensitive; isfile() skips
        # directories and names that do not exist.
        if os.path.splitext(child)[1] == '.html' and os.path.isfile(childPath):
            replaceStringInFile(childPath)
            print(child)

# os.walk is used instead of os.path.walk because the latter exists only in
# Python 2. Only file names are passed on; myfun ignores non-files anyway.
for dirr, _subdirs, filess in os.walk(mydir):
    myfun('dummy', dirr, filess)

See also: Perl: Find & Replace on Multiple Files.

blog comments powered by Disqus