# -*- coding: utf-8 -*-
# பயனர்:Maathavan/replacer.py
import codecs
import re
import time
import webbrowser

import pywikibot
import pagegenerators
import editarticle
from pywikibot import i18n

# Imports predefined replacements tasks from fixes.py
import fixes
# This is required for the text that is shown when you run this script
# with the parameter -help.
# Maps placeholder tokens to the help text provided by the pagegenerators
# and fixes modules.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
    '&fixes-help;': fixes.help,
}
class XmlDumpReplacePageGenerator:
    """
    Iterable over the pages of an XML dump that the replacements would change.

    Arguments:
        * xmlFilename  - Name of the XML dump file to parse.
        * xmlStart     - Title of the dump entry to start with; entries before
                         it are skipped. A false value disables skipping.
        * replacements - Iterable of (old, new) pairs handed to
                         pywikibot.replaceExcept().
        * exceptions   - Dictionary of exception lists. The keys read here are
                         'title', 'require-title', 'text-contains', 'inside'
                         and 'inside-tags'.
    """

    def __init__(self, xmlFilename, xmlStart, replacements, exceptions):
        self.xmlFilename = xmlFilename
        self.replacements = replacements
        self.exceptions = exceptions
        self.xmlStart = xmlStart
        # While True, entries are dropped until xmlStart is encountered.
        self.skipping = bool(xmlStart)

        # Collect everything that protects a text region from replacement.
        self.excsInside = []
        if "inside-tags" in self.exceptions:
            self.excsInside += self.exceptions['inside-tags']
        if "inside" in self.exceptions:
            self.excsInside += self.exceptions['inside']
        # Imported here so the module is only needed when a dump is used.
        import xmlreader
        self.site = pywikibot.getSite()
        dump = xmlreader.XmlDump(self.xmlFilename)
        self.parser = dump.parse()

    def __iter__(self):
        """Yield a Page for every dump entry whose text would be changed."""
        try:
            for entry in self.parser:
                if self.skipping:
                    if entry.title != self.xmlStart:
                        continue
                    self.skipping = False
                if self.isTitleExcepted(entry.title) \
                        or self.isTextExcepted(entry.text):
                    continue
                new_text = entry.text
                for old, new in self.replacements:
                    # The site must be passed by keyword: the fifth
                    # positional parameter of replaceExcept() is
                    # caseInsensitive, not site.
                    new_text = pywikibot.replaceExcept(
                        new_text, old, new, self.excsInside,
                        site=self.site)
                if new_text != entry.text:
                    yield pywikibot.Page(self.site, entry.title)
        except KeyboardInterrupt:
            try:
                if not self.skipping:
                    pywikibot.output(
                        u'To resume, use "-xmlstart:%s" on the command line.'
                        % entry.title)
            except NameError:
                # Interrupted before the first entry was read, so 'entry'
                # is not bound yet; nothing to report.
                pass

    def isTitleExcepted(self, title):
        """
        Return True if the title matches a 'title' exception or fails to
        match one of the 'require-title' patterns.
        """
        if "title" in self.exceptions:
            for exc in self.exceptions['title']:
                if exc.search(title):
                    return True
        if "require-title" in self.exceptions:
            for req in self.exceptions['require-title']:
                if not req.search(title):
                    # if not all requirements are met:
                    return True
        return False

    def isTextExcepted(self, text):
        """Return True if the text matches a 'text-contains' exception."""
        if "text-contains" in self.exceptions:
            for exc in self.exceptions['text-contains']:
                if exc.search(text):
                    return True
        return False
class ReplaceRobot:
    """
    A bot that can do text replacements.

    Arguments:
        * generator    - Iterable yielding the pages to work on.
        * replacements - Iterable of (old, new) pairs handed to
                         pywikibot.replaceExcept().
        * exceptions   - Dictionary of exception lists. The keys read here are
                         'title', 'require-title', 'text-contains', 'inside'
                         and 'inside-tags'. Defaults to an empty dictionary.
        * acceptall    - If True, changes are saved without asking.
        * allowoverlap - Passed on to pywikibot.replaceExcept().
        * recursive    - If True, replacements are repeated until the text no
                         longer changes.
        * addedCat     - If set, title of a category added to changed pages.
        * sleep        - If not None, seconds to sleep before each replacement.
        * editSummary  - Summary used when saving a page.
        * articles     - If set, a writable file object; page titles are
                         written to it instead of editing the wiki.
        * exctitles    - If set, a writable file object that receives the
                         titles the user chose to except.
    """

    def __init__(self, generator, replacements, exceptions=None,
                 acceptall=False, allowoverlap=False, recursive=False,
                 addedCat=None, sleep=None, editSummary='', articles=None,
                 exctitles=None):
        self.generator = generator
        self.replacements = replacements
        # A fresh dict per instance; a shared mutable default would leak
        # state between robots.
        self.exceptions = {} if exceptions is None else exceptions
        self.acceptall = acceptall
        self.allowoverlap = allowoverlap
        self.recursive = recursive
        if addedCat:
            # Only set when requested: run() tests for the attribute.
            site = pywikibot.getSite()
            self.addedCat = pywikibot.Page(site, addedCat,
                                           defaultNamespace=14)
        self.sleep = sleep
        # Some function to set default editSummary should probably be added
        self.editSummary = editSummary
        self.articles = articles
        self.exctitles = exctitles

        # An edit counter to split the file by 100 titles if -save or -savenew
        # is on, and to display the number of edited articles otherwise.
        self.editcounter = 0
        # A counter for saved exceptions
        self.exceptcounter = 0

    def isTitleExcepted(self, title):
        """
        Iff one of the exceptions applies for the given title, returns True.
        """
        if "title" in self.exceptions:
            for exc in self.exceptions['title']:
                if exc.search(title):
                    return True
        if "require-title" in self.exceptions:
            for req in self.exceptions['require-title']:
                if not req.search(title):
                    return True
        return False

    def isTextExcepted(self, original_text):
        """
        Iff one of the 'text-contains' exceptions matches the given text,
        returns True.
        """
        if "text-contains" in self.exceptions:
            for exc in self.exceptions['text-contains']:
                if exc.search(original_text):
                    return True
        return False

    def doReplacements(self, original_text):
        """
        Returns the text which is generated by applying all replacements to
        the given text.
        """
        new_text = original_text
        exceptions = []
        if "inside-tags" in self.exceptions:
            exceptions += self.exceptions['inside-tags']
        if "inside" in self.exceptions:
            exceptions += self.exceptions['inside']
        for old, new in self.replacements:
            if self.sleep is not None:
                time.sleep(self.sleep)
            # NOTE(review): the source called pywikibot.replace() here; the
            # dump generator in this file uses replaceExcept() with the same
            # arguments, so that function is assumed to be the intended one.
            new_text = pywikibot.replaceExcept(
                new_text, old, new, exceptions,
                allowoverlap=self.allowoverlap)
        return new_text

    def _countSuffix(self, count):
        """Return the plural/verb suffix used by the counter messages."""
        return ' was' if count == 1 else 's were'

    def writeEditCounter(self):
        """Writes the number of saved titles or changed pages."""
        if self.articles:
            pywikibot.output(u'%d title%s saved.'
                             % (self.editcounter,
                                self._countSuffix(self.editcounter)))
        else:
            pywikibot.output(u'%d page%s changed.'
                             % (self.editcounter,
                                self._countSuffix(self.editcounter)))

    def writeExceptCounter(self):
        """
        This writes the counter of saved exceptions if applicable.
        """
        if self.exctitles:
            pywikibot.output(u'%d exception%s saved.'
                             % (self.exceptcounter,
                                self._countSuffix(self.exceptcounter)))

    def splitLine(self):
        """
        Returns a separator line after every 100th title, otherwise an empty
        string. The separator is an HTML comment so that the saved list stays
        usable as wiki text.
        """
        if self.editcounter % 100:
            return ''
        return (u'<!-- ***** %dth title is above this line. ***** -->\n'
                % self.editcounter)

    def _saveTitle(self, page):
        """Write the page title to the titles file instead of editing."""
        self.editcounter += 1
        self.articles.write(u'#%s\n%s'
                            % (page.title(asLink=True, textlink=True),
                               self.splitLine()))
        self.articles.flush()  # For the peace of our soul :-)

    def run(self):
        """
        Starts the robot.
        """
        # Run the generator which will yield Pages which might need to be
        # changed.
        for page in self.generator:
            if self.isTitleExcepted(page.title()):
                pywikibot.output(
                    u'Skipping %s because the title is on the exceptions list.'
                    % page.title(asLink=True))
                continue
            try:
                # Load the page's text from the wiki
                original_text = page.get(get_redirect=True)
                if not (self.articles or page.canBeEdited()):
                    pywikibot.output(u"You can't edit page %s"
                                     % page.title(asLink=True))
                    continue
            except pywikibot.NoPage:
                pywikibot.output(u'Page %s not found'
                                 % page.title(asLink=True))
                continue
            new_text = original_text
            while True:
                if self.isTextExcepted(new_text):
                    pywikibot.output(u'Skipping %s because it contains text '
                                     u'that is on the exceptions list.'
                                     % page.title(asLink=True))
                    break
                new_text = self.doReplacements(new_text)
                if new_text == original_text:
                    pywikibot.output(u'No changes were necessary in %s'
                                     % page.title(asLink=True))
                    break
                if self.recursive:
                    # Re-apply the replacements until a fixed point is
                    # reached.
                    newest_text = self.doReplacements(new_text)
                    while newest_text != new_text:
                        new_text = newest_text
                        newest_text = self.doReplacements(new_text)
                if hasattr(self, "addedCat"):
                    cats = page.categories()
                    if self.addedCat not in cats:
                        cats.append(self.addedCat)
                        new_text = pywikibot.replaceCategoryLinks(new_text,
                                                                  cats)
                # Show the title of the page we're working on.
                # Highlight the title in purple.
                pywikibot.output(u"\n\n>>> \03{lightpurple}%s\03{default} <<<"
                                 % page.title())
                pywikibot.showDiff(original_text, new_text)
                if self.acceptall:
                    break
                if self.exctitles:
                    choice = pywikibot.inputChoice(
                        u'Do you want to accept these changes?',
                        ['Yes', 'No', 'no+eXcept', 'Edit',
                         'open in Browser', 'All', 'Quit'],
                        ['y', 'N', 'x', 'e', 'b', 'a', 'q'], 'N')
                else:
                    choice = pywikibot.inputChoice(
                        u'Do you want to accept these changes?',
                        ['Yes', 'No', 'Edit', 'open in Browser', 'All',
                         'Quit'],
                        ['y', 'N', 'e', 'b', 'a', 'q'], 'N')
                if choice == 'e':
                    editor = editarticle.TextEditor()
                    as_edited = editor.edit(original_text)
                    # if user didn't press Cancel
                    if as_edited and as_edited != new_text:
                        new_text = as_edited
                    continue
                if choice == 'b':
                    webbrowser.open("http://%s%s" % (
                        page.site.hostname(),
                        page.site.nice_get_address(page.title())
                    ))
                    i18n.input('pywikibot-enter-finished-browser')
                    try:
                        original_text = page.get(get_redirect=True,
                                                 force=True)
                    except pywikibot.NoPage:
                        pywikibot.output(u'Page %s has been deleted.'
                                         % page.title())
                        break
                    new_text = original_text
                    continue
                if choice == 'q':
                    self.writeEditCounter()
                    self.writeExceptCounter()
                    return
                if choice == 'a':
                    self.acceptall = True
                if choice == 'x':
                    # May happen only if self.exctitles isn't None
                    self.exctitles.write(
                        u"ur'^%s$',\n" % re.escape(page.title()))
                    self.exctitles.flush()
                    self.exceptcounter += 1
                if choice == 'y':
                    if not self.articles:
                        # Primary behaviour: working on wiki
                        page.put_async(new_text, self.editSummary)
                        self.editcounter += 1
                        # Bug: this increments even if put_async fails
                        # This is separately in two clauses of if for
                        # future purposes to get feedback form put_async
                    else:
                        # Save the title for later processing instead of
                        # editing
                        self._saveTitle(page)
                # choice must be 'N'
                break
            if self.acceptall and new_text != original_text:
                if not self.articles:
                    # Primary behaviour: working on wiki
                    try:
                        page.put(new_text, self.editSummary)
                        self.editcounter += 1  # increment only on success
                    except pywikibot.EditConflict:
                        pywikibot.output(
                            u'Skipping %s because of edit conflict'
                            % (page.title(),))
                    except pywikibot.SpamfilterError as e:
                        pywikibot.output(
                            u'Cannot change %s because of blacklist entry %s'
                            % (page.title(), e.url))
                    except pywikibot.PageNotSaved as error:
                        pywikibot.error(u'putting page: %s'
                                        % (error.args,))
                    except pywikibot.LockedPage:
                        pywikibot.output(u'Skipping %s (locked page)'
                                         % (page.title(),))
                else:
                    # Save the title for later processing instead of editing
                    self._saveTitle(page)

        # Finally:
        self.writeEditCounter()
        self.writeExceptCounter()
def prepareRegexForMySQL(pattern):
    """
    Return the pattern rewritten for use inside a quoted MySQL RLIKE string.

    The Python shorthands \\s, \\d and \\w are replaced by the texts
    '[:space:]', '[:digit:]' and '[:alnum:]', and every single quote is
    prefixed with a backslash.

    NOTE(review): the class names are inserted without an enclosing bracket
    expression, so a shorthand used outside of [...] presumably does not
    yield a valid POSIX class - confirm against the MySQL regex syntax.
    """
    pattern = pattern.replace(r'\s', '[:space:]')
    pattern = pattern.replace(r'\d', '[:digit:]')
    pattern = pattern.replace(r'\w', '[:alnum:]')
    # Escape the quote character that delimits the SQL string literal.
    pattern = pattern.replace("'", "\\" + "'")
    return pattern
def main(*args):
    """
    Parse the command line, build the page generator and run ReplaceRobot.

    Arguments:
        * args - Command line arguments, passed on to pywikibot.handleArgs().

    Raises pywikibot.Error if the replacement file cannot be opened, if an
    odd number of replacement texts was given, or if -fix is combined with
    replacement texts.
    """
    add_cat = None
    gen = None
    # summary message
    summary_commandline = False
    # Array which will collect commandline parameters.
    # First element is original text, second element is replacement text.
    commandline_replacements = []
    # A list of 2-tuples of original text and replacement text.
    replacements = []
    # Don't edit pages which contain certain texts.
    exceptions = {
        'title': [],
        'text-contains': [],
        'inside': [],
        'inside-tags': [],
        'require-title': [],  # using a separate requirements dict needs some
    }                         # major refactoring of code.

    # Should the elements of 'replacements' and 'exceptions' be interpreted
    # as regular expressions?
    regex = False
    # Predefined fixes from dictionary 'fixes' (see above).
    fix = None
    # the dump's path, either absolute or relative, which will be used
    # if -xml flag is present
    xmlFilename = None
    # Title of the dump entry to start with (-xmlstart); None means "from
    # the beginning".
    xmlStart = None
    useSql = False
    PageTitles = []
    # will become True when the user presses a ('yes to all') or uses the
    # -always flag.
    acceptall = False
    # Will become True if the user inputs the commandline parameter -nocase
    caseInsensitive = False
    # Will become True if the user inputs the commandline parameter -dotall
    dotall = False
    # Will become True if the user inputs the commandline parameter -multiline
    multiline = False
    # Do all hits when they overlap
    allowoverlap = False
    # Do not recurse replacement
    recursive = False
    # This is the maximum number of pages to load per query
    maxquerysize = 60
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    editSummary = i18n.twtranslate(pywikibot.getSite(), 'replace-replacing',
                                   {'description': u''})
    # Between a regex and another (using -fix) sleep some time (not to waste
    # too much CPU
    sleep = None
    # Do not save the page titles, rather work on wiki
    filename = None   # The name of the file to save titles
    titlefile = None  # The file object itself
    # If we save, primary behaviour is append rather then new file
    append = True
    # Default: don't write titles to exception file and don't read them.
    excoutfilename = None  # The name of the file to save exceptions
    excoutfile = None      # The file object itself
    # excinfilename: reserved for later use (reading back exceptions)
    # If we save exceptions, primary behaviour is append
    excappend = True

    # Read commandline parameters.
    # Longer option names are tested before the shorter ones they start
    # with (-xmlstart before -xml, -savenew before -save, ...).
    for arg in pywikibot.handleArgs(*args):
        if arg == '-regex':
            regex = True
        elif arg.startswith('-xmlstart'):
            if len(arg) == 9:
                xmlStart = pywikibot.input(
                    u'Please enter the dumped article to start with:')
            else:
                xmlStart = arg[10:]
        elif arg.startswith('-xml'):
            if len(arg) == 4:
                xmlFilename = i18n.input('pywikibot-enter-xml-filename')
            else:
                xmlFilename = arg[5:]
        elif arg == '-sql':
            useSql = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(pywikibot.input(
                    u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-saveexcnew'):
            excappend = False
            if len(arg) == 11:
                excoutfilename = pywikibot.input(
                    u'Please enter the filename to save the excepted titles' +
                    u'\n(will be deleted if exists):')
            else:
                excoutfilename = arg[12:]
        elif arg.startswith('-saveexc'):
            if len(arg) == 8:
                excoutfilename = pywikibot.input(
                    u'Please enter the filename to save the excepted titles:')
            else:
                excoutfilename = arg[9:]
        elif arg.startswith('-savenew'):
            append = False
            if len(arg) == 8:
                filename = pywikibot.input(
                    u'Please enter the filename to save the titles' +
                    u'\n(will be deleted if exists):')
            else:
                filename = arg[9:]
        elif arg.startswith('-save'):
            if len(arg) == 5:
                filename = pywikibot.input(
                    u'Please enter the filename to save the titles:')
            else:
                filename = arg[6:]
        elif arg.startswith('-replacementfile'):
            if len(arg) == len('-replacementfile'):
                replacefile = pywikibot.input(
                    u'Please enter the filename to read replacements from:')
            else:
                replacefile = arg[len('-replacementfile') + 1:]
            try:
                replacementsFile = codecs.open(replacefile, 'r', 'utf-8')
                try:
                    # One text per line; a leading BOM and the line
                    # terminator are stripped.
                    commandline_replacements.extend(
                        [x.lstrip(u'\uFEFF').rstrip('\r\n')
                         for x in replacementsFile])
                finally:
                    replacementsFile.close()
            except IOError:
                raise pywikibot.Error(
                    '\n%s cannot be opened. Try again :-)' % replacefile)
        elif arg.startswith('-excepttitle:'):
            exceptions['title'].append(arg[13:])
        elif arg.startswith('-requiretitle:'):
            exceptions['require-title'].append(arg[14:])
        elif arg.startswith('-excepttext:'):
            exceptions['text-contains'].append(arg[12:])
        elif arg.startswith('-exceptinside:'):
            exceptions['inside'].append(arg[14:])
        elif arg.startswith('-exceptinsidetag:'):
            exceptions['inside-tags'].append(arg[17:])
        elif arg.startswith('-fix:'):
            fix = arg[5:]
        elif arg.startswith('-sleep:'):
            sleep = float(arg[7:])
        elif arg == '-always':
            acceptall = True
        elif arg == '-recursive':
            recursive = True
        elif arg == '-nocase':
            caseInsensitive = True
        elif arg == '-dotall':
            dotall = True
        elif arg == '-multiline':
            multiline = True
        elif arg.startswith('-addcat:'):
            add_cat = arg[8:]
        elif arg.startswith('-summary:'):
            editSummary = arg[9:]
            summary_commandline = True
        elif arg.startswith('-allowoverlap'):
            allowoverlap = True
        elif arg.startswith('-query:'):
            maxquerysize = int(arg[7:])
        else:
            # Anything the generator factory does not recognise is taken
            # as a replacement text.
            if not genFactory.handleArg(arg):
                commandline_replacements.append(arg)

    if pywikibot.verbose:
        pywikibot.output(u"commandline_replacements: " +
                         ', '.join(commandline_replacements))

    if len(commandline_replacements) % 2:
        raise pywikibot.Error('require even number of replacements.')
    elif len(commandline_replacements) == 2 and fix is None:
        replacements.append((commandline_replacements[0],
                             commandline_replacements[1]))
        if not summary_commandline:
            editSummary = i18n.twtranslate(
                pywikibot.getSite(), 'replace-replacing',
                {'description': ' (-%s +%s)'
                 % (commandline_replacements[0],
                    commandline_replacements[1])})
    elif len(commandline_replacements) > 1:
        if fix is None:
            # Consecutive arguments form (old, new) pairs.
            pairs = [(commandline_replacements[i],
                      commandline_replacements[i + 1])
                     for i in range(0, len(commandline_replacements), 2)]
            replacements.extend(pairs)
            if not summary_commandline:
                replacementsDescription = '(%s)' % ', '.join(
                    [('-' + pair[0] + ' +' + pair[1]) for pair in pairs])
                editSummary = i18n.twtranslate(
                    pywikibot.getSite(), 'replace-replacing',
                    {'description': replacementsDescription})
        else:
            raise pywikibot.Error(
                'Specifying -fix with replacements is undefined')
    elif fix is None:
        # Neither replacements nor a fix were given: fall back to the
        # hard-coded default replacement.
        old = u'என்'
        new = u'எனது'
        change = '(-' + old + ' +' + new + ')'
        replacements.append((old, new))
        if not summary_commandline:
            default_summary_message = i18n.twtranslate(
                pywikibot.getSite(), 'replace-replacing',
                {'description': change})
            pywikibot.output(u'The summary message will default to: %s'
                             % default_summary_message)
            summary_message = pywikibot.input(
                u'Press Enter to use this default message, or enter a ' +
                u'description of the\nchanges your bot will make:')
            if summary_message == '':
                summary_message = default_summary_message
            editSummary = summary_message

    else:
        # Perform one of the predefined actions.
        fixname = fix  # Save the name for passing to exceptions function.
        try:
            fix = fixes.fixes[fix]
        except KeyError:
            pywikibot.output(u'Available predefined fixes are: %s'
                             % fixes.fixes.keys())
            return
        if "regex" in fix:
            regex = fix['regex']
        if "msg" in fix:
            if isinstance(fix['msg'], basestring):
                editSummary = i18n.twtranslate(pywikibot.getSite(),
                                               str(fix['msg']))
            else:
                editSummary = pywikibot.translate(pywikibot.getSite(),
                                                  fix['msg'])
        if "exceptions" in fix:
            exceptions = fix['exceptions']
            # Try to append common extensions for multiple fixes.
            # It must be either a dictionary or a function that returns a dict.
            if 'include' in exceptions:
                incl = exceptions['include']
                if callable(incl):
                    baseExcDict = incl(fixname)
                else:
                    baseExcDict = incl
                if baseExcDict:
                    for l in baseExcDict:
                        try:
                            exceptions[l].extend(baseExcDict[l])
                        except KeyError:
                            exceptions[l] = baseExcDict[l]
        if "recursive" in fix:
            recursive = fix['recursive']
        if "nocase" in fix:
            caseInsensitive = fix['nocase']
        try:
            replacements = fix['replacements']
            # enable regex/replacements as a dictionary for different langs
            if isinstance(replacements, dict):
                replacements = replacements[pywikibot.getSite().lang]
        except KeyError:
            pywikibot.output(u"No replacements given in fix.")
            return

    # Set the regular expression flags
    flags = re.UNICODE
    if caseInsensitive:
        flags = flags | re.IGNORECASE
    if dotall:
        flags = flags | re.DOTALL
    if multiline:
        flags = flags | re.MULTILINE

    # Pre-compile all regular expressions here to save time later
    for i, (old, new) in enumerate(replacements):
        if not regex:
            old = re.escape(old)
        replacements[i] = re.compile(old, flags), new

    # 'inside-tags' is deliberately not compiled here.
    for exceptionCategory in ['title', 'require-title', 'text-contains',
                              'inside']:
        if exceptionCategory in exceptions:
            patterns = exceptions[exceptionCategory]
            if not regex:
                patterns = [re.escape(pattern) for pattern in patterns]
            patterns = [re.compile(pattern, flags) for pattern in patterns]
            exceptions[exceptionCategory] = patterns

    if xmlFilename:
        gen = XmlDumpReplacePageGenerator(xmlFilename, xmlStart,
                                          replacements, exceptions)
    elif useSql:
        # NOTE(review): the query is assembled by string formatting; the
        # patterns only pass through prepareRegexForMySQL() before being
        # embedded.
        whereClause = 'WHERE (%s)' % ' OR '.join(
            ["old_text RLIKE '%s'" % prepareRegexForMySQL(old.pattern)
             for (old, new) in replacements])
        # Only the 'text-contains' patterns describe page text, which is
        # what old_text is matched against. Iterating the dictionary itself
        # would yield its string keys, which have no .pattern attribute.
        textExceptions = exceptions.get('text-contains', [])
        if textExceptions:
            exceptClause = 'AND NOT (%s)' % ' OR '.join(
                ["old_text RLIKE '%s'" % prepareRegexForMySQL(exc.pattern)
                 for exc in textExceptions])
        else:
            exceptClause = ''
        query = u"""
SELECT page_namespace, page_title
FROM page
JOIN text ON (page_id = old_id)
%s
%s
LIMIT 200""" % (whereClause, exceptClause)
        gen = pagegenerators.MySQLPageGenerator(query)
    elif PageTitles:
        pages = [pywikibot.Page(pywikibot.getSite(), PageTitle)
                 for PageTitle in PageTitles]
        gen = iter(pages)

    gen = genFactory.getCombinedGenerator(gen)
    if not gen:
        # syntax error, show help text from the top of this file
        pywikibot.showHelp('replace')
        return

    preloadingGen = pagegenerators.PreloadingGenerator(
        gen, pageNumber=maxquerysize)

    # Finally we open the file for page titles or set parameter article to None
    if filename:
        try:
            # This opens in strict error mode, that means bot will stop
            # on encoding errors with ValueError.
            # See http://docs.python.org/library/codecs.html#codecs.open
            titlefile = codecs.open(filename, encoding='utf-8',
                                    mode='a' if append else 'w')
        except IOError:
            pywikibot.output("%s cannot be opened for writing." % filename)
            return
    # The same process with exceptions file:
    if excoutfilename:
        try:
            excoutfile = codecs.open(excoutfilename, encoding='utf-8',
                                     mode='a' if excappend else 'w')
        except IOError:
            pywikibot.output("%s cannot be opened for writing."
                             % excoutfilename)
            # Do not leak the titles file that was already opened.
            if titlefile:
                titlefile.close()
            return
    try:
        bot = ReplaceRobot(preloadingGen, replacements, exceptions,
                           acceptall, allowoverlap, recursive, add_cat,
                           sleep, editSummary, titlefile, excoutfile)
        bot.run()
    finally:
        # Just for the spirit of programming (they were flushed)
        if titlefile:
            titlefile.close()
        if excoutfile:
            excoutfile.close()
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always executed, whether main() returned or raised.
        pywikibot.stopme()