Pygrep -- a Python context Grep.

#!/usr/local/bin/python """ Program to perform a grep on files, by Hugh Sasse. This is a context grep program which will descend into directories if required, and supress unprintable characters. The module holds classes: Pygrep: This does the searching of files, and returns exit codes as appropriate. It also produces results in self.results which can be handled with methods of the Pygrep_results class. Pygrep_results: This is a dictionary of matching files with the lines that match. The methods it supports are for combining search results in various ways beyond the functionality of the Pygrep class. Author: Hugh Sasse Institution: De Montfort University, Leicester UK You may use this software and modify it for your own needs, and redistribute it freely, but you may not claim that your wrote it. Enhancements would be welcomed. Andrew Kuchling has already suggested a huge speedup. Many thanks to him. """ import sys # for file i/o etc import string # for character translations, etc import re # for regular expression matching etc. import getopt # for argument processing. import os # for handling the files and dirs import glob # In case we get * or ? or [..] import stat # for finding out what a file is. from UserDict import UserDict # for the base class of Pygrep_results # globchars that may be used in a glob style match. globchars = re.compile(r'[\!\*\?\[\]]') # global functions to wrap up the stat functions. def is_dir(file): """Test if filespec is a directory """ mystat=os.stat(file) mymode=mystat[stat.ST_MODE] return stat.S_ISDIR(mymode) def is_reg_file(file): """Test if filespec is a regular file """ mystat=os.stat(file) mymode=mystat[stat.ST_MODE] return stat.S_ISREG(mymode) def is_link(file): """Test if filespec is a symbolic link """ mystat=os.lstat(file) mymode=mystat[stat.ST_MODE] return stat.S_ISLNK(mymode) class Pygrep_err(Exception): """ Exceptions raised by the pygrep class """ pass; class Pygrep_results(UserDict): """ results of a call to pygrep.pygrep the results clsass is a dicitionary of filespecs which are strings, and the entries in each is a list of matching line numbers. This class extends the dictionary class to allow setwise combination of results. """ # We can use the dictionary methods for most things. def __init__(self, value=None): UserDict.__init__(self) if value != None: for key in value.keys(): self[key] = value[key][:] return None def __or__(self, other): """ OR two pygrep_results together: a | b NOTE this is bot a bitwise or. It is a file by file,and then line by line or. """ new = Pygrep_results() # a new dictionary for the results. for key in self.keys(): if other.has_key(key): if len(other[key]) > 0: newlist = [] mylist = self[key] + other[key] for i in range(len(mylist)): if (not mylist[i] in mylist[i+1:]): newlist.append(mylist[i]) new[key] = newlist[:] else: new[key] = self[key][:] for key in other.keys(): if not self.has_key(key): # because already copied if len(other[key]) > 0: new[key] = other[key][:] for key in new.keys(): new[key].sort() return new def __add__(self, other): """ Add two pygrep_results together: a + b This is the same as an or, because of the way they are merged. """ new = self | other return new def __and__(self, other): """ AND two pygrep_results together: a & b Only files which both have some lines that match will be retained. This is not a line by line AND, because that can be done with the pygrep method's -j option, and we need a filewise AND somewhere. It can also be done with a * (__mul__). """ new = Pygrep_results() # a new dictionary for the results. for key in self.keys(): if other.has_key(key): # OR the actual lines together. newlist = [] mylist = self[key] + other[key] for i in range(len(mylist)): if (not mylist[i] in mylist[i+1:]): newlist.append(mylist[i]) new[key] = newlist[:] for key in new.keys(): new[key].sort() return new def __xor__(self, other): """ XOR two pygrep_results together: a ^ b this is a filewise XOR. A line by line XOR can be done with (A * B) % B. """ new = Pygrep_results() # a new dictionary for the results. for key in self.keys(): if not other.has_key(key): new[key] = self[key][:] for key in keys(other): if not self.has_key(key): new[key] = self[key][:] for key in new.keys(): new[key].sort() return new def __mul__(self, other): """ linewise AND two Pygrep_results together: a * b Only lines which are in both self and other will be retained. This is basically the same as -j option to the pygrep method. """ new = Pygrep_results() # a new dictionary for the results. for key in self.keys(): if other.has_key(key): # OR the actual lines together. newlist = [] for i in self[key]: if i in other[key]: newlist.append(i) new[key] = newlist[:] for key in new.keys(): new[key].sort() return new def __mod__(self, other): """ remove other's lines from self : a - b Only lines which are in self but not in other will remain. This is not a commutative operation. """ new = Pygrep_results() # a new dictionary for the results. for key in self.keys(): if other.has_key(key): # subtract the actual lines.... newlist = [] for i in self[key]: if not (i in other[key]): newlist.append(i) new[key] = newlist[:] else: new[key] = self[key][:] for key in new.keys(): new[key].sort() return new def __sub__(self, other): """ remove files matching other from self : a - b Only files which are in self but not in other will remain. This is not a commutative operation. '-' was used because this is what most people mean by subtracting results. """ new = Pygrep_results() # a new dictionary for the results. for key in self.keys(): if not other.has_key(key): new[key] = self[key][:] for key in new.keys(): new[key].sort() return new def display(self): """display the set of results in some generic form This displays the lines in the files, with the line numbers. When context searching is in use the results may be a bit odd if lines of one file are knocked out by combining with another. """ for file in self.keys(): try: fp = open(file) lines = fp.readlines() fp.close() except IOError: sys.stderr.write("pygrep: Unable to open %s for reading\n" % file) for line in self[file]: print "%s:%d:%s" % file, line, lines[line - 1] pass # test the class and bomb out def test_Pygrep_results(): print "In test_Pygrep_results()..." results = Pygrep_results() print "results is ",results results = Pygrep_results({"This":[1,2,3], "That":[4,5,6]}) print "results is ",results results2 = Pygrep_results({"That":[4,5,7]}) print "results2 is ",results2 results3 = results * results2 print "results3 is ",results3 results4 = results & results2 print "results4 is ",results4 results5 = results2 & results print "results5 is ",results5 results6 = results | results2 print "results6 is ",results6 results7 = results | results2 print "results7 is ",results7 results8 = results - results2 print "results8 is ",results8 results9 = results2 - results print "results9 is ",results9 results10 = results % results2 print "results10 is ",results10 results11 = results2 % results print "results11 is ",results11 results12 = results + results2 print "results12 is ",results12 results13 = results2 + results print "results13 is ",results13 print "Leaving test_Pygrep_results()..." return None class Pygrep: """grep files using Perl REs. This takes a list of args in the same way as the program does, and sorts out what do do with getopt... """ # We need to know what constitutes control chars of # the sort that won't appear in a text file. For this # reason backspace, del, formfeed and vertical tab are # removed. These were handled by regular expression # substitutions, but Andrew Kuchling # suggested I change them tu use string.translate. # This was a huge speedup, so Thank You, Andrew. control_chars = string.join(map(chr, range(0,011) + [013,014] + range(016,040)),"") # Characters with the hi bit set can cause problems, so # we may wish to knock them out. hi_bit_chars = string.join(map(chr, range(0200,0400)),"") # print "hi_bit_chars is",len(hi_bit_chars),"characters long" binary_re = re.compile(r'[\000-\010\013\014\016-\037\200-\377]') # now we need a table to map [\200-\377] to [\000-\177] # Remember range(a,b) gives [a,...b-1] table = range(0,128) * 2 # convert to a string. hi_lo_table = string.join(map(chr,table),"") # print "hi_lo_table is",len(hi_lo_table),"characters long" all_chars = string.join(map(chr,range(0,256)),"") # print "all_chars is",len(all_chars),"characters long" def __init__(self): """ Initilise the flags etc """ self.a_flag = self.A_flag = self.B_flag = \ self.c_flag = self.C_flag = self.d_flag = self.D_flag = \ self.E_flag = self.F_flag = \ self.h_flag = self.H_flag = self.i_flag = self.I_flag = self.l_flag = \ self.n_flag = self.R_flag = self.s_flag = self.S_flag = \ self.v_flag = self.V_flag = 0 self.and_flag = 0; self.patterns = [] self.cpatterns = []; # compiled patterns self.patfiles = [] self.files = [] self.status = 0 # the exit status self.show_dashes = 0 self.show_names = 0 self.show_number = 0 self.line_count = 0 self.results = Pygrep_results() def set_options(self, optlist): """Given a list of options, set the flags. The flags are set to 1 if they are boolean, or to the supplied value. If the help flags are used then the usage proc is called and this funtion exits, returning 1, so you can tell if this has happened. """ for option, value in optlist: if option == '-a': self.a_flag = 1 elif option == '-A': self.A_flag = 1 elif option == '-B': self.B_flag = 1 elif (option == '-c') or (option == "--count"): self.c_flag = 1 elif (option == '-C') or (option == "--Context"): self.C_flag = string.atoi(value); elif option == '-d': self.d_flag = 1 elif option == '-D': self.D_flag = 1 elif option == '-e': self.patterns.append(value) elif option == '-E': self.E_flag = 1 elif option == '-f': self.patfiles.append(value) elif option == '-F': self.F_flag = 1 elif (option == '-h') or (option == "--head"): self.h_flag = 1 elif (option == '-H') or (option == '--help'): self.usage() ### maybe raise exception?? return 1 elif option == '-i': self.i_flag = 1 elif option == '-I': self.I_flag = 1 elif (option == '-j') or (option == "--and"): self.and_flag = 1 elif option == "--or": # has no short form self.and_flag = 0 elif option == '-l': self.l_flag = 1 elif option == '-n': self.n_flag = 1 elif option == '-R': self.R_flag = 1 elif option == '-s': self.s_flag = 1 elif option == '-S': self.S_flag = 1 elif (option == '-v') or (option == "--invert"): self.v_flag = 1 elif (option == '-V') or (option == "--version"): self.V_flag = 1 print "pygrep version 1.5" return 1 return 0 def match_patterns(self, str): """ peform a map on the patterns to match string Actually use search as its results are more useful """ if self.i_flag: str = string.lower(str) if self.F_flag: # fixed strings, no patterns patterns = self.patterns result = map(lambda x,y=str : (string.find(y,x)>-1), patterns) else: cpatterns = self.cpatterns result = map(lambda x,y=str : x.search(y), cpatterns) return result def or_matches_together(self, matchlist): """ Given a list of matches, say if any matched """ result = reduce( lambda x,y: x or y, matchlist,0) return result def and_matches_together(self, matchlist): """ Given a list of matches, say if all matched """ result = reduce( lambda x,y: x and y, matchlist,1) return result def find_context(self, indices, maxind ): """ add lines 'in context' to a list of line numbers Work out the surrounding context of each listed line, and deal with overlaps and start/end of the file. maxind is the last possible line. """ if (self.C_flag % 2): # if it is odd context = self.C_flag - 1 else: context = self.C_flag half_context = context / 2 start = [] end = [] for i in indices: startpos = i - half_context if startpos < 1: start.append(1) else: start.append(startpos) endpos = i + half_context if endpos > maxind: end.append(maxind) else: end.append(endpos) # merge the start and end lists into one long # list with lots of redundancy. temp = [] for i in range(len(start)): temp = temp + range(start[i],end[i]+1) results = [] for i in range(len(temp)): # print "i is ",i,"temp[i] is ", temp[i] if (not temp[i] in temp[i+1:]): results.append(temp[i]) # print temp[i], " not in ", temp[i+1:] # else: # print temp[i]," in ", temp[i+1:] return results def expand_files(self): """ Expand a list of supplied files. This does globbing, expanding directories recursively, and checking for files that are not regular files. """ start = 0; while (start < len(self.files)): for f in range(start, len(self.files)): file = self.files[f] if file == "-": # this is stdin start = start + 1 continue if globchars.search(file) != None: self.files = self.files + glob.glob(file) self.files[f:f+1] = [] break # reprocess the new list if is_link(file): ostart = start try: if is_reg_file(file): start = start + 1 else: self.files[f:f+1] = [] except os.error: if not self.E_flag: sys.stderr.write("pygrep: %s is a link to a missing file or directory\n" % file) if not self.S_flag: self.status = 2 self.files[f:f+1] = [] if (start == ostart): break else: continue if is_dir(file): if self.R_flag: contents = os.listdir(file) # print "contents is ", contents, "\n" contents = map(lambda x,f=file: f + os.sep +x, contents) # print "contents is ", contents, "\n" self.files = self.files + contents else: if not self.E_flag: sys.stderr.write("pygrep: %s is a directory. Use -R for recorsion\n" % file) if not self.S_flag: self.status = 2 self.files[f:f+1] = [] break if not is_reg_file(file): if not self.E_flag: sys.stderr.write("pygrep: %s is not a regular file\n" % file) if not self.S_flag: self.status = 2 self.files[f:f+1] = [] break start = start + 1 def pygrep(self, args): """Do the actual grepping based on the supplied arguments. """ optlist, self.files = getopt.getopt(args, "aABcC:dDe:Ef:FhHiIjlnRsSvV", ["and", "context", "count", "head", "help", "invert", "or", "version"]) # print "optlist is ", optlist # print "files is ", self.files if self.set_options(optlist): # true if Usage is called or version requested. return 0 # Now read the patterns in the patfiles if any # into tee pattern list. for file in self.patfiles: fp = sys.open(file) lines = fp.readlines() fp.close() lines = map(lambda x:re.sub(r'\n$',"",x), lines) self.patterns.append(lines) if len(self.patterns) == 0: # we have no patterns in the option list # so the first "file" must be a pattern if len(self.files) > 0: self.patterns.append(self.files[0]) self.files = self.files[1:] else: sys.stderr.write("pygrep: No pattern supplied\n") self.usage() raise Pygrep_err, "No pattern supplied" # We now have a complete list of the patterns. # Searching for regular expressions is more expensive # than searching for strings. If our patterns # contain no metacharacters then we can use string # searches instead. if not filter(lambda x: re.match(r'[^\w\s]',x), self.patterns): # nothing like a metachar found here... self.F_flag = 1 if len(self.files) == 0: self.files = ["-"] #read from stdin if no files. # expand the list of files -- expand directories # to contents, globs to groups of files, etc self.expand_files() # If we are using fixed patterns we don't need to # compile the regexps. if self.F_flag: if self.i_flag: # If case insensitive make it all lower case. self.patterns = map(lambda x:string.lower(x),self.patterns) else: # not fixed patterns if self.i_flag: self.cpatterns = map(re.compile, self.patterns); else: self.cpatterns = map(lambda x:re.compile(x,re.I), self.patterns); if self.l_flag: self.show_names = 1 else: if (len(self.files) > 1) or (self.R_flag): self.show_names = 1 if self.C_flag: self.show_dashes = 1 if self.n_flag: self.show_number = 1 if self.D_flag: self.show_dashes = 0 # Now get down and do the grepping! for file in self.files: lines = [] try: if file == "-": lines = sys.stdin.readlines() else: fp = open(file) lines = fp.readlines() fp.close() except IOError: if not self.E_flag: sys.stderr.write("pygrep: Unable to read %s\n" % file) if not self.S_flag: self.status = 2 continue # try the next file. # filter out characters we don't want to # display . # first deal with the high bit chars... if not (self.a_flag or self.A_flag): lines = map(lambda x,y=self : string.translate(x, y.all_chars, y.hi_bit_chars), lines) if self.A_flag: lines = map(lambda x,y=self: string.translate(x, y.hi_lo_table),lines) if self.I_flag: found = 0 x = 0; while (not(found) and (x < len(lines))): if self.binary_re.match(lines[x]): found = 1 break x = x + 1 if found: continue # with next file if not self.B_flag: lines = map(lambda x,y=self : string.translate(x, y.all_chars, y.control_chars), lines) # now match the patterns agains the lines. # print file, ":len(lines) = ", len(lines) if self.and_flag: # and the patterns together indices = \ map(lambda x,y=self,z=lines: # for each index... ( y.and_matches_together( y.match_patterns(z[x]) ) ) and (x+1), range(len(lines)) ) else: # or the patterns together. indices = \ map(lambda x,y=self,z=lines: # for each index... ( y.or_matches_together( y.match_patterns(z[x]) ) ) and (x+1), range(len(lines)) ) # print file,":len(indices) matched = ", len(indices) # print "indices = ", indices indices = filter(None, indices) # print file,":len(indices) filtered = ", len(indices) # print "indices = ", indices if self.v_flag: # invert the logic indices = map(lambda x,y=indices: (((x+1) not in y) or None) and (x+1), # the or is to make Not return None range(len(lines)) ) indices = filter(None, indices) # print file, ":len(indices) inverted = ", len(indices) if (len(indices)>0): self.status = 1 if self.C_flag: indices = self.find_context(indices,len(lines)) # Now save the results in the variable. new_results = Pygrep_results({file:indices}) self.results = self.results + new_results # we are now ready to print out the results # for this file. if not self.s_flag: #silent if (len(indices) == 0): if self.d_flag: print file else: # len(indices) > 0 if self.d_flag: continue # with next file if self.l_flag: print file elif self.c_flag: if self.show_names: print "%s:%d" % (file, len(indices)) else: print "%d" % len(indices) self.line_count = self.line_count + len(indices) else: if self.show_dashes: print "---" for i in range(len(indices)-1): if self.show_names: print "%s:" % file, if self.show_number: print "%d:" % indices[i], print lines[indices[i]-1], if (indices[i]+1 != indices[i+1]) and self.show_dashes: print "---" if self.show_names: print "%s:" % file, if self.show_number: print "%d:" % indices[-1], print lines[indices[-1]-1], if self.show_dashes: print "---" if not self.s_flag: if self.c_flag: print "total = ",self.line_count return self.status def usage(self): """Explain how to use the pygrep utility This sends the explanation to the standard error stream because it may be called in the event of an error. """ sys.stderr.write( """ usage grep [-BcdEFhHilnRsSvV] [-C context] {-e pattern}|pattern|{-f patfile} path [ path ...] where these terms have the following meanings: options: -a allow 8 bit characters through. Normally filtered out. -A collapse 8-bit to 7-bit ASCII. Implies -a. -B allow Binary files through unfiltered. Normally control chars are filtered out. Characters with the high bit set may not be extended ASCII, so this option DOES NOT imply -a. -c Only count lines -C count lines of context to show. An odd number includes the line, even excludes it. -d list only names of files that Don't have matching lines -D Don't Display Dashes between matching segments -e pattern match this pattern -E suppress error messages -f patfile file of patterns to match, 1 per line -F Act as fgrep -- treat metachars as normal characters -h don't put names at start of the lines -H Give this help message -i ignore case in matches -I completely Ignore files containing binary characters (control chars). Takes account of -a and -A flags -j match lines using conJunction (AND) of patterns: lines match if all the patterns match. The default is disjunction (OR): lines match if any patterns match. -l list only names of files with matching lines -n include the linenumber in the outppt -R descend into directories Recursively. -s silent, don't write lines or filenames to output, just set the status. -S suppress setting the exit Status on error -v inVert the matching logic-- show lines that don't match -V Show the Version number, etc and the long options are: --and same as -j --context count same as -C --count same as -c --head same as -h --help same as -H --invert same as -i --or turns off -j. This is the default anyway --version same as -V path is the path of a file (or for -R, a directory) pygrep will search through the files and find lines that match the regular expression supplied. If there is more than one file to process then the nane of the file is prepended to the line (unless the -h option is in use. Default settings have been chosen so as to prevent garbage appearing on the screen (options -a -A -B) or excessive output (-R), but to give reasonable diagnostics (-s -S). Bugs: Too many options. Maybe the setup needs to be done interacting at a prompt!? So many options means the options must be case sensitive. -j for conjunction is an awful mnemonic for and. The program is written to read in a file at one go. It is thus memory hungry, but my memory stingy version (in Perl) ran too slowly. """) pass # to help with aligning text # Some of the processing we only need to do if # run as a program. if __name__ == '__main__': # test_Pygrep_results() grep = Pygrep() sys.exit(grep.pygrep(sys.argv[1:]))