Perl-Python: Find and Replace By Regex Text Patterns

Xah Lee, 2005-02-21

Previously we had scripts that replace strings for all files in a dir. See find and replace strings with Perl or Python.

However, sometimes you want to use a text pattern instead of just strings. For example, suppose you are working in HTML and you want links of this form:

<a href="http://en2.wikipedia.org/wiki/Artemis">Artemis</a>

to become:

<a href="http://en2.wikipedia.org/wiki/Artemis">Artemis↗</a>

You need a text pattern to do the job. In particular, you want a pattern that matches starting with “<a href="”, then followed by a url, then followed by “">”, and a text, and the closing tag </a>. And, you want to be able to capture the url and the link text, so that you can use them in your replacement string.

The following is a modified version of our previous Find and Replace script. This version replaces text in all HTML files in a directory using a regex pattern.

(Always back up your files before use.)

# -*- coding: utf-8 -*-
# Python

import os, sys, re

# Root directory whose .html files are rewritten in place.
mydir= '/Users/t/web/Periodic_dosage_dir/t1'

# Group 1 captures the link URL, group 2 the link text.
# The negative lookbehind rejects links whose text already ends with the
# arrow, so running the script a second time does not append another one.
patn=r'''<a href="(http[^"]+)">([^>]+)(?<!↗)</a>'''
replacemt=r'''<a href="\1">\2↗</a>'''

def replaceStringInFile(filePath):
    """Rewrite one file with every match of patn replaced by replacemt.

    The new content is written to a sibling temp file (filePath + '~~~'),
    which is then renamed over the original. The file is rewritten whether
    or not anything matched. Prints filePath when done.
    """
    tempName = filePath + '~~~'

    # Read before creating the temp file, so a failed read does not leave
    # an empty temp file behind. The with-blocks close the handles even
    # when an exception is raised.
    with open(filePath) as inFile:
        text = inFile.read()

    newText = re.sub(patn, replacemt, text)

    with open(tempName, 'w') as outFile:
        outFile.write(newText)

    os.rename(tempName, filePath)
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print(filePath)

def myfun(dummy, dirr, filess):
    """Run replaceStringInFile on each regular '.html' file among filess.

    dummy  -- unused
    dirr   -- directory that contains the entries
    filess -- entry names inside dirr
    """
    for child in filess:
        path = os.path.join(dirr, child)
        if os.path.splitext(child)[1] == '.html' and os.path.isfile(path):
            replaceStringInFile(path)
            print(child)

# os.walk replaces os.path.walk, which is deprecated and absent from Python 3.
# myfun keeps its callback-style signature; 3 is the same dummy value as before.
for dirpath, dirnames, filenames in os.walk(mydir):
    myfun(3, dirpath, filenames)

The following version finds and replaces text in all HTML files in a directory, given a list of (regex pattern, replacement) pairs. The files are assumed to be UTF-8 encoded (this also works for plain ASCII files, because ASCII is a subset of UTF-8).

# -*- coding: utf-8 -*-
# Python

# find and replace in a dir by multiple pairs of regex

import os, sys,shutil,re

# Root directory whose .html files are processed.
mydir= '/Users/xah/some'

# List of (compiled pattern, replacement template) pairs, applied in order.
# The literals use r'' / u'' prefixes rather than ur'', which is not valid
# syntax on Python 3; the replacement therefore spells its backslashes as \\.
findreplace = [

# Wikipedia links whose visible text is the bare URL: keep the href and show
# only the last path segment followed by an arrow.
#  - the dot in "wikipedia.org" is escaped so it matches a literal dot only;
#  - the optional subdomain excludes '"' and '/', so it cannot run past the
#    end of the href value;
#  - the host in the link text is a single [\w.]+ run: a nested quantifier
#    there backtracks exponentially on link text that contains no '/'.
(re.compile(r'''<a href="http://(?P<urlpart>(?:[^."/]+\.)?wikipedia\.org[^"]+)">http://[\w.]+/(?:\w+/)+(?P<title>[^<]+)</a>''',re.U|re.M),
u'''<a href="http://\\g<urlpart>">\\g<title>↗</a>'''), # wikipedia

# more regex pairs here
]


def replaceStringInFile(filePath):
    """Apply every (pattern, replacement) pair in findreplace to one file.

    The file is read as UTF-8. The pairs are applied in order, each one to
    the output of the previous one. If at least one pair matched, a backup
    copy is saved as filePath + '~re~' and the file is overwritten in place
    with the UTF-8 encoded result; otherwise the file is left untouched.
    """
    backupName = filePath + '~re~'

    # Single-argument print(...) produces the same output on Python 2 and 3.
    print('reading: %s' % filePath)
    with open(filePath, 'rb') as inF:
        text = inF.read().decode('utf-8')

    # subn both substitutes and counts, so no separate search pass is needed
    # to find out whether anything matched.
    totalReps = 0
    for pattern, replacement in findreplace:
        text, count = re.subn(pattern, replacement, text)
        totalReps += count

    if not totalReps:
        return

    print(' writing: %s' % filePath)
    shutil.copy2(filePath, backupName)
    # Open the existing file for update instead of recreating it; the
    # original author's stated intent is to preserve the file creation date.
    with open(filePath, 'r+b') as outF:
        outF.write(text.encode('utf-8'))
        # Drop leftover bytes when the new content is shorter than the old.
        outF.truncate()

def myfun(dummy, curdir, filess):
    """Run replaceStringInFile on each regular '.html' file among filess.

    dummy  -- unused
    curdir -- directory that contains the entries
    filess -- entry names inside curdir
    """
    for child in filess:
        path = os.path.join(curdir, child)
        if os.path.splitext(child)[1] == '.html' and os.path.isfile(path):
            replaceStringInFile(path)

# os.walk replaces os.path.walk, which is deprecated and absent from Python 3.
# myfun keeps its callback-style signature; 3 is the same dummy value as before.
for dirpath, dirnames, filenames in os.walk(mydir):
    myfun(3, dirpath, filenames)

For a full-featured script that does find-replace in Perl, see: Find & Replace on Multiple Files with Perl


See also:


Page created: 2005-02.
© 2005 by Xah Lee.
Xah Signet