#!/usr/bin/env python
"""
Searches and lists (optionally removing) duplicate files in specified
directories. If directories are not specified, then the PATH environment
variable is examined.
"""

"""
HISTORY:
v 0.1 Sometime in 2001 - Initial release
v 0.2 06/08/2002 - check if directories searched are not really the same
                   directory
v 0.3 29/09/2002 - handle files with spaces in them.
                   add force option to keep only the latest version
v 0.4 13/07/2004 - treat links as normal files too, so that masking links
                   get detected
                   print number of duplicate files they'll have to go through
                   Class-ified
2005-03-12 19:05:41 Alejandro Dubrovsky
    * survive failure to delete file
    * switch to optparse commandline parsing
    * added ignore file option
2005-04-09 18:08:38 Alejandro Dubrovsky
    * point out if duplicated directories point to each other
2005-04-20 13:06:10 Alejandro Dubrovsky
    * ignore directories specified in the ignore file too
2006-07-15 18:25:14 Alejandro Dubrovsky
    * sort list of duplicates alphabetically
2011-05-08 Alejandro Dubrovsky
    * Remove portage dependency
    * Modernise code a little bit
"""

import logging
import os
import sys
import time
from collections import defaultdict
from optparse import OptionParser

__author__ = 'Alejandro Dubrovsky'

DefaultIgnoreFile = os.path.expanduser("~/.rmdups/ignore")


class FileInfo(object):
    """Vital statistics kept for every file or directory examined.

    Attributes:
        name: path of the entry as found (search directory joined with the
            entry name).
        size: size in bytes as reported by ``os.lstat``.
        ctime: timestamp used to decide which duplicate is the newest.
            main() stores the modification time (``st_mtime``) here.
        realpath: path with symlinks resolved, or '' if not computed.
    """

    def __init__(self, name, size, ctime, realpath=''):
        self.name = name
        self.size = size
        self.ctime = ctime
        self.realpath = realpath

    def formatInfo(self):
        """Return 'name<TAB>size<TAB>human readable timestamp'."""
        return self.name + '\t' + str(self.size) + '\t' + time.ctime(self.ctime)


def confirm(message='', write=sys.stdout.write, read=sys.stdin.readline):
    """Ask a yes/no question until it is answered with 'y' or 'n'.

    Args:
        message: prompt written before every attempt.
        write: callable used to emit the prompt.
        read: callable returning one line of input per call.

    Returns:
        True if the first character of the answer is 'y' (any case),
        False if it is 'n' or if the input is exhausted.
    """
    while True:
        write(message)
        line = read()
        if not line:
            # readline() yields '' only at end of input (a blank answer is
            # '\n'). Re-prompting would spin forever, so take it as "no".
            return False
        answer = line.strip()[:1].lower()
        if answer in ('y', 'n'):
            return answer == 'y'


def confirmDelete(filename):
    """Interactively offer to delete *filename*.

    Returns:
        True only if the user agreed and the file was actually removed.
        A failed removal is logged, not raised.
    """
    message = 'Delete %s (y/n)?' % filename
    deleted = False
    if confirm(message):
        try:
            os.remove(filename)
            deleted = True
        except (IOError, OSError) as e:
            logging.error('could not remove %s: %s' % (filename, e))
    return deleted


def constructPathList(paths):
    """Return the existing directories in *paths*, each one only once.

    Order is preserved. Two entries count as the same directory when they
    resolve to the same (device, inode) pair, which also covers repeated
    strings and symlinks to an already listed directory. Entries that are
    not directories are reported with a warning and skipped.
    """
    pathList = []
    seen = set()
    for p in paths:
        if not os.path.isdir(p):
            logging.warning('%s does not exist or is not a directory' % p)
            continue
        stats = os.stat(p)
        # inode numbers are only unique within one filesystem, so the
        # device number has to be part of the identity.
        identity = (stats.st_dev, stats.st_ino)
        if identity not in seen:
            seen.add(identity)
            pathList.append(p)
    return pathList


def readFile(filename):
    """Return the list of filenames/directories named in *filename*.

    Blank lines and lines starting with '#' are skipped; surrounding
    whitespace is stripped from every entry. If the file cannot be opened
    the error is logged and an empty list is returned.
    """
    try:
        fin = open(filename)
    except (OSError, IOError) as e:
        logging.error("Could not open %s for reading: %s" % (filename, e))
        return []
    entries = []
    with fin:
        for line in fin:
            if line.startswith("#"):
                continue
            entry = line.strip()
            if entry:
                entries.append(entry)
    return entries


def main(args):
    """Run rmdups with the command line arguments *args* (without argv[0]).

    Scans the given directories (or those in PATH when none are given) for
    entries sharing a base name, reports dangling links, duplicated files
    and duplicated directories, and unless -l is given removes duplicates
    either interactively or, with -f, keeping only the newest copy.
    """
    parser = OptionParser(
        'Usage: rmdups [-l|-f] [-v] [-h] [-i ignorefile] [directories]',
        epilog=__doc__)
    parser.add_option("-v", "--verbose", dest="verbose", default=False,
                      action="store_true", help="Be verbose")
    parser.add_option("-l", "--listonly", dest="listonly", default=False,
                      action="store_true",
                      help="list only (do not remove any files)")
    parser.add_option("-f", "--force", dest="force", default=False,
                      action="store_true",
                      help="keep latest version, remove all others.")
    parser.add_option("-i", "--ignorefile", dest="ignoreFile",
                      default=DefaultIgnoreFile, action="store",
                      help="file containing list of files and directories "
                           "to ignore when searching for duplicates. "
                           "(default: %default)")
    options, rest = parser.parse_args(args)

    if options.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.WARNING)

    if options.force and options.listonly:
        options.force = False
        logging.warning('-f and -l should not be used together. Getting rid of -f for safety reasons')

    if rest:
        path = constructPathList(rest)
    else:
        # Only fall back to PATH when no directory was named at all; a
        # mistyped directory must not silently redirect (possibly forced)
        # deletion to the system PATH.
        path = constructPathList(
            [p for p in os.environ.get('PATH', '').split(os.pathsep) if p])

    # files and directories to ignore
    ignore = set()
    if options.ignoreFile:
        ignore.update(readFile(os.path.expanduser(options.ignoreFile)))
    path = [directory for directory in path if directory not in ignore]

    trouble = {}        # base name -> list of FileInfo sharing that name
    allfiles = {}       # base name -> first FileInfo seen with that name
    troubledirs = {}    # same as trouble, for directories
    alldirs = {}        # same as allfiles, for directories
    pointsto = defaultdict(list)     # real file path -> links pointing to it
    dirpointsto = defaultdict(list)  # real dir path -> links pointing to it
    badlinks = []       # links whose target does not exist

    for directory in path:
        try:
            entries = os.listdir(directory)
        except OSError as e:
            logging.error('could not list %s: %s' % (directory, e))
            continue
        for each in entries:
            if each in ignore:
                continue
            f = os.path.join(directory, each)
            try:
                thestats = os.lstat(f)
            except OSError as e:
                # the entry vanished between listdir() and lstat()
                logging.error('could not stat %s: %s' % (f, e))
                continue
            # info is the vital info kept for each file: name, size, modtime
            info = FileInfo(f, thestats.st_size, thestats.st_mtime)
            isalink = os.path.islink(f)
            if isalink and not os.path.exists(f):
                # keep track of dangling links
                badlinks.append(info)

            if os.path.isfile(f):
                name = os.path.basename(f)
                info.realpath = os.path.realpath(f)
                if isalink:
                    # keep track of which file the link points to
                    pointsto[info.realpath].append(f)
                # keep a list of all tracked files, duplicates in trouble
                if name in trouble:
                    known = [other.realpath for other in trouble[name]]
                    if info.realpath not in known:
                        trouble[name].append(info)
                elif name in allfiles and info.realpath != allfiles[name].realpath:
                    trouble[name] = [info, allfiles[name]]
                else:
                    allfiles[name] = info
            elif os.path.isdir(f):
                # keep track of duplicated directories too
                name = os.path.basename(f)
                info.realpath = os.path.realpath(f)
                if isalink:
                    dirpointsto[info.realpath].append(f)
                if name in troubledirs:
                    troubledirs[name].append(info)
                elif name in alldirs:
                    troubledirs[name] = [info, alldirs[name]]
                else:
                    alldirs[name] = info

    if badlinks:
        print('Links to nowhere:')
        for link in badlinks:
            filename = link.name
            whereto = os.readlink(filename)
            print(link.formatInfo() + ' -> ' + whereto)
            if not options.listonly:
                confirmDelete(filename)

    troubledFiles = sorted(trouble.values(), key=lambda group: group[0].name)
    if troubledFiles:
        print('Duplicated files (%d instances):' % (len(troubledFiles),))
    for files in troubledFiles:
        for f in files:
            print(f.formatInfo())
            filename = f.name
            for link in pointsto[filename]:
                print('is pointed to by ' + link)
            if os.path.islink(filename):
                whereto = os.readlink(filename)
                print(' points to %s ' % (whereto,))
        if not options.listonly:
            if options.force:
                # Always keep exactly one copy: the newest (first one wins
                # on ties), whatever its timestamp happens to be.
                newest = max(files, key=lambda candidate: candidate.ctime)
                for f in files:
                    if f is newest:
                        continue
                    logging.info('Removing ' + f.formatInfo())
                    try:
                        os.remove(f.name)
                    except (IOError, OSError) as e:
                        logging.error('Could not remove %s: %s' % (f.name, e))
            else:
                for f in files:
                    filename = f.name
                    confirmDelete(filename)
                    for link in pointsto[filename]:
                        confirmDelete(link)
        print('')

    if troubledirs:
        print('Duplicated directories:')
        for group in troubledirs.values():
            for f in group:
                print(f.formatInfo())
                if f.name in dirpointsto:
                    print("pointed to by %s" % ",".join(dirpointsto[f.name]))
            print('')


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))