Utilisateur:PimpBot/source

# -*- coding: utf-8  -*-
"""
This module makes slight modifications to the source code of a wiki page so
that the markup looks cleaner. The changes are not supposed to alter the
rendered appearance of the page.
 
WARNING: This module needs more testing!
"""
 
import wikipedia, pagegenerators, catlib
import re, codecs
 
# Summary message when using this module as a stand-alone script
msg_standalone = {
    'de': u'Bot: Kosmetische Änderungen',
    'en': u'Robot: Cosmetic changes',
    'fr': u'[[User:PimpBot|PimpBot]] : changements cosmétiques et orthographiques',
    'he': u'רובוט: שינויים קוסמטיים',
    'pt': u'Bot: Mudanças triviais',
    }
 
# Summary message that will be appended to the normal message when
# cosmetic changes are made on the fly
msg_append = {
    'de': u'; kosmetische Änderungen',
    'en': u'; cosmetic changes',
    'fr': u'; changements cosmétiques',
    'he': u'; שינויים קוסמטיים',
    'pt': u'; mudanças triviais',
    }
 
deprecatedTemplates = {
    'wikipedia': {
        'de': [
            u'Stub',
        ]
    }
}
 
 
 
def replaceExceptMathNowikiLinksGalleryAndComments(text, old, new):
    """
    Replaces old by new in text, skipping occurrences of old within nowiki,
    math and gallery tags, HTML comments, templates, wikilinks, bare external
    URLs, and ISBN/ASIN/telephone numbers.

    Parameters:
        text - a string
        old  - a compiled regular expression or a pattern string
        new  - a string
    """
    if isinstance(old, basestring):
        old = re.compile(old)
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->|<math>.*?</math>|<gallery>.*?</gallery>|\{\{[^}]*?\}\}|\[\[[^\]\|]*?\||\[\[[^\]\|]*?\]\]|http://[^\s]+\s|ISBN[-\d\s]*|ASIN[-\d\s:]+|Tel\s?[:]\s?[\d-]+', re.IGNORECASE | re.DOTALL)
 
    # How much of the text we have looked at so far
    index = 0
    while True:
        match = old.search(text, index)
        if not match:
            break
        noTouchMatch = nowikiOrHtmlCommentR.search(text, index)
        if noTouchMatch and noTouchMatch.start() < match.start():
            # an HTML comment or text in nowiki tags stands before the next valid match. Skip.
            index = noTouchMatch.end()
        else:
            # We found a valid match. Replace it.
            text = text[:match.start()] + old.sub(new, text[match.start():match.end()]) + text[match.end():]
            # continue the search on the remaining text
            index = match.start() + len(new)
    return text
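# A minimal illustration (a sketch; exact behaviour depends on the regexes
# above): occurrences inside protected regions such as <nowiki> are kept.
#
#   >>> replaceExceptMathNowikiLinksGalleryAndComments(
#   ...     u'wiki <nowiki>wiki</nowiki>', u'wiki', u'Wiki')
#   u'Wiki <nowiki>wiki</nowiki>'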
 
 
# Variant that additionally leaves external links untouched.
def replaceExceptMathNowikiLinksGalleryAndComments2(text, old, new):
    """
    Like replaceExceptMathNowikiLinksGalleryAndComments, but additionally
    skips bracketed external links, bare www. URLs, ISBN-13 numbers and
    hexadecimal colour codes.

    Parameters:
        text - a string
        old  - a compiled regular expression or a pattern string
        new  - a string
    """
    if isinstance(old, basestring):
        old = re.compile(old)
    nowikiOrHtmlCommentR = re.compile(r'<nowiki>.*?</nowiki>|<!--.*?-->|<math>.*?</math>|<gallery>.*?</gallery>|\{\{[^}]*?\}\}|\[\[[^\]\|]*?\||\[\[[^\]\|]*?\]\]|http://[^\s]+\s|\[http://[^\s]+\s[^\]]+\]|www\.[^\s]+\s|ISBN-13[:]?[-\d\s]*|ISBN[-\d\s]*|ASIN[-\d\s:]+|Tel\s?[:]\s?[\d-]+|#[0-9A-Fa-f]{6}', re.IGNORECASE | re.DOTALL)
 
    # How much of the text we have looked at so far
    index = 0
    while True:
        match = old.search(text, index)
        if not match:
            break
        noTouchMatch = nowikiOrHtmlCommentR.search(text, index)
        if noTouchMatch and noTouchMatch.start() < match.start():
            # an HTML comment or text in nowiki tags stands before the next valid match. Skip.
            index = noTouchMatch.end()
        else:
            # We found a valid match. Replace it.
            text = text[:match.start()] + old.sub(new, text[match.start():match.end()]) + text[match.end():]
            # continue the search on the remaining text
            index = match.start() + len(new)
    return text
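# For example (a sketch): text inside a bracketed external link is left
# untouched, while the same word outside the link is replaced.
#
#   >>> replaceExceptMathNowikiLinksGalleryAndComments2(
#   ...     u'test [http://example.org test site]', u'test', u'TEST')
#   u'TEST [http://example.org test site]'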
 
 
 
 
class CosmeticChangesToolkit:
    def __init__(self, site, debug = False):
        self.site = site
        self.debug = debug
 
    def change(self, text):
        """
        Given a wiki source code text, returns the cleaned up version.
        """
        oldText = text
        text = self.standardizeInterwiki(text)
        text = self.standardizeCategories(text)
        #text = self.cleanUpLinks(text)
        text = self.cleanUpSectionHeaders(text)
        text = self.translateAndCapitalizeNamespaces(text)
        text = self.removeDeprecatedTemplates(text)
        text = self.resolveHtmlEntities(text)
        text = self.validXhtml(text)
        text = self.removeUselessSpaces(text)
        text = self.replaceWithNiceQuotes(text)
        text = self.replaceIer(text)
        text = self.replaceNumbers(text)
        text = self.replaceParenthesis(text)
        text = self.replaceCenturies(text)
 
        return text
 
    def standardizeInterwiki(self, text):
        """
        Makes sure that interwiki links are put to the correct position and
        into the right order.
        """
        interwikiLinks = wikipedia.getLanguageLinks(text, insite = self.site)
        text = wikipedia.replaceLanguageLinks(text, interwikiLinks, site = self.site)
        return text
 
    def standardizeCategories(self, text):
        """
        Makes sure that interwiki links are put to the correct position, but
        does not sort them.
        """
        categories = wikipedia.getCategoryLinks(text, site = self.site)
        text = wikipedia.replaceCategoryLinks(text, categories, site = self.site)
        return text
 
    def translateAndCapitalizeNamespaces(self, text):
        """
        Makes sure that localized namespace names are used.
        """
        family = self.site.family
        for nsNumber in family.namespaces:
            thisNs = family.namespace(self.site.lang, nsNumber)
            defaultNs = family.namespace('_default', nsNumber)
            if thisNs != defaultNs:
                text = wikipedia.replaceExceptNowikiAndComments(text, r'\[\[\s*' + defaultNs + r'\s*:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + r':\g<nameAndLabel>]]')
        if self.site.nocapitalize: 
            for nsNumber in family.namespaces:
                thisNs = family.namespace(self.site.lang, nsNumber)
                lowerNs = thisNs[0].lower() + thisNs[1:] # lowercase the first letter of the namespace name
                text = wikipedia.replaceExceptNowikiAndComments(text, r'\[\[\s*' + lowerNs + r'\s*:(?P<nameAndLabel>.*?)\]\]', r'[[' + thisNs + r':\g<nameAndLabel>]]')
        return text
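    # Illustrative sketch: on a wiki whose local namespace name differs from
    # the family default (for instance 'Bild' vs. 'Image' on de-wiki at the
    # time this bot was written), a link like [[Image:Foo.jpg]] becomes
    # [[Bild:Foo.jpg]].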
 
    def cleanUpLinks(self, text):
        trailR = re.compile(self.site.linktrail())
        # The regular expression which finds links. Relevant groups:
        # group titleWithSection is the target page title, that is, everything
        #   before | or ]; a section anchor, including its '#', is kept as
        #   part of this group to make life easier for us.
        # group label is the alternative link title, that's everything between | and ].
        # group linktrail is the link trail, that's letters after ]] which are part of the word.
        # note that the definition of 'letter' varies from language to language.
        self.linkR = re.compile(r'\[\[(?P<titleWithSection>[^\]\|]+)(\|(?P<label>[^\]\|]*))?\]\](?P<linktrail>' + self.site.linktrail() + ')')
        curpos = 0
        # This loop will run until we have finished the current page
        while True:
            m = self.linkR.search(text, pos = curpos)
            if not m:
                break
            # Make sure that next time around we will not find this same hit.
            curpos = m.start() + 1
            titleWithSection = m.group('titleWithSection')
            if not self.site.isInterwikiLink(titleWithSection):
                # The link looks like this:
                # [[page_title|link_text]]trailing_chars
                # We only work on namespace 0 because pipes and linktrails work
                # differently for images and categories.
                page = wikipedia.Page(self.site, titleWithSection)
                if page.namespace() == 0:
                    # Replace underlines by spaces, also multiple underlines
                    titleWithSection = re.sub('_+', ' ', titleWithSection)
                    # Remove double spaces
                    titleWithSection = re.sub('  +', ' ', titleWithSection)
                    # Convert URL-encoded characters to unicode
                    titleWithSection = wikipedia.url2unicode(titleWithSection, site = self.site)
                    label = m.group('label') or titleWithSection
                    trailingChars = m.group('linktrail')
                    if trailingChars:
                        label += trailingChars
                    if titleWithSection == label:
                        newLink = "[[%s]]" % titleWithSection
                    # Check if we can create a link with trailing characters instead of a pipelink
                    elif len(titleWithSection) <= len(label) and label[:len(titleWithSection)] == titleWithSection and re.sub(trailR, '', label[len(titleWithSection):]) == '':
                        newLink = "[[%s]]%s" % (label[:len(titleWithSection)], label[len(titleWithSection):])
                    else:
                        # Try to capitalize the first letter of the title.
                        # Maybe this feature is not useful for languages that
                        # don't capitalize nouns...
                        #if not self.site.nocapitalize:
                        if self.site.sitename() == 'wikipedia:de':
                            titleWithSection = titleWithSection[0].upper() + titleWithSection[1:]
                        newLink = "[[%s|%s]]" % (titleWithSection, label)
                    text = text[:m.start()] + newLink + text[m.end():]
        return text
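    # Example of the simplifications above (note this method is currently
    # disabled in change()): [[démocratie|démocraties]] becomes
    # [[démocratie]]s, and [[titre|titre]] collapses to [[titre]].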
 
    def resolveHtmlEntities(self, text):
        ignore = [
             38,     # Ampersand (&amp;)
             60,     # Less than (&lt;)
             62,     # Greater than (&gt;)
            160,     # Non-breaking space (&nbsp;) - not supported by Firefox textareas
        ]
        text = wikipedia.html2unicode(text, ignore = ignore)
        return text
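    # e.g. u'&eacute;' is resolved to u'é', while &amp;, &lt;, &gt; and
    # &nbsp; are deliberately kept as entities.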
 
    def validXhtml(self, text):
        text = wikipedia.replaceExceptNowikiAndComments(text, r'<br>', r'<br />')
        return text
 
    def removeUselessSpaces(self, text):
        result = []
        multipleSpacesR = re.compile('  +')
        spaceAtLineEndR = re.compile(' $')
        preR = re.compile('<pre', re.IGNORECASE)
        lines = text.split('\n')
        for line in lines:
            if len(line) > 0 and line[0] != ' ' and not preR.search(line):
                line = wikipedia.replaceExceptMathNowikiAndComments(line, multipleSpacesR, ' ')
                line = wikipedia.replaceExceptMathNowikiAndComments(line, spaceAtLineEndR, '')
            result.append(line)
        return '\n'.join(result)
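    # Example: u'Un  exemple ' becomes u'Un exemple'; indented lines and
    # lines containing <pre> are left alone, since whitespace is significant
    # there.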
 
    def cleanUpSectionHeaders(self, text):
        for level in range(1, 7):
            equals = '=' * level
            text = wikipedia.replaceExceptNowikiAndComments(text, r'\n' + equals + ' *(?P<title>[^=]+?) *' + equals + ' *\r\n', r'\n' + equals + ' \g<title> ' + equals + '\r\n')
        return text
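    # Example: u'\n==Histoire==\r\n' becomes u'\n== Histoire ==\r\n'. Note
    # that, as written, the regex only matches CRLF-terminated header lines.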
 
    def removeDeprecatedTemplates(self, text):
        if self.site.family.name in deprecatedTemplates and self.site.lang in deprecatedTemplates[self.site.family.name]:
            for template in deprecatedTemplates[self.site.family.name][self.site.lang]:
                if not self.site.nocapitalize:
                    template = '[' + template[0].upper() + template[0].lower() + ']' + template[1:]
                text = wikipedia.replaceExceptNowikiAndComments(text, r'\{\{([mM][sS][gG]:)?' + template + '(?P<parameters>\|[^}]+|)}}', '')
        return text
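    # Example: with the deprecatedTemplates table above, {{Stub}}, {{stub}}
    # and {{msg:Stub}} (with or without parameters) are removed from de-wiki
    # pages.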
 
    def replaceWithNiceQuotes(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, r'([^\'])\'([^\'\[])', r'\1' + u'’' +r'\2' )
        text3 = replaceExceptMathNowikiLinksGalleryAndComments( text2, r'([^\'])\'\[', r'\1' + u'’['  )
        text4 = wikipedia.replaceExceptMathNowikiAndComments( text3, r'\[\[([^:|\]\']*?)\'([^:|\]\']*?)\]\]', r'[[\1' + u'\'' + r'\2|\1' + u'’' + r'\2]]' )
        return text4
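    # Example: u"L'exemple d'usage" becomes u"L’exemple d’usage"; wiki markup
    # such as ''italique'' and '''gras''' is not affected because the regexes
    # require a single, isolated apostrophe.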
 
    def replaceIer(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, r'(\s)Ier(\s)', r'\1I{{er}}\2' )
        text3 = wikipedia.replaceExceptMathNowikiAndComments( text2, r'\[\[([^:|\]]*?)Ier([^:|\]]*?)\]\]', r'[[\1' + u'Ier' + r'\2|\1' + u'I{{er}}' + r'\2]]' )
        text4 = replaceExceptMathNowikiLinksGalleryAndComments( text3, r'(\s)(1)er(\s)', r'\1\2{{er}}\3' )
        text5 = wikipedia.replaceExceptMathNowikiAndComments( text4, r'\[\[([^:|\]]*?)1er([^:|\]]*?)\]\]', r'[[\1' + u' 1er' + r'\2|\1' + u' 1{{er}}' + r'\2]]' )
        text6 = wikipedia.replaceExceptMathNowikiAndComments( text5, r'(\s)(\d+)(eme|ème|e)(\s)', r'\1\2{{e}}\4' )
        return text6
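    # Example: u'le 2ème tome' becomes u'le 2{{e}} tome', and u'François Ier '
    # becomes u'François I{{er}} ' (the patterns require surrounding
    # whitespace).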
 
    def replaceNumbers(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments2( text, r'(\d{5,10})', r'' + u'{{formatnum:' + r'\1' + u'}}' )
        text3 = replaceExceptMathNowikiLinksGalleryAndComments2( text2, r'([^\w\d])(\d)( |&nbsp[;])(\d{3})([^\w/])', r'\1\2\4\5' )
        text4 = replaceExceptMathNowikiLinksGalleryAndComments2( text3, r'([^\w])(\d+)( |&nbsp[;])(\d{3})( |&nbsp[;])(\d{3})([^\w/])', r'\1' + u'{{formatnum:' + r'\2\4\6' + u'}}' +r'\7' )
        text5 = replaceExceptMathNowikiLinksGalleryAndComments2( text4, r'([^\w])(\d+)( |&nbsp[;])(\d{3})([^\w/])', r'\1' + u'{{formatnum:' + r'\2\4' + u'}}' + r'\5' )
        return text5
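    # Example: u' 12 345 km' becomes u' {{formatnum:12345}} km', and a bare
    # run of five or more digits such as u'12345' is wrapped as
    # u'{{formatnum:12345}}'.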
 
    def replaceParenthesis(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, r'([^\s])\(', r'\1 (' )
        return text2
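    # Example: u'exemple(note)' becomes u'exemple (note)'.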
 
    def replaceCenturies(self, text):
        text2 = replaceExceptMathNowikiLinksGalleryAndComments( text, r'([XVI]+)(e|' +u'è' + r'me|eme)' + u' siècle' + r'([^\w])(av|av\.|avant)\s(JC|J\.C\.|J\.-C\.|J\.-C|J-C)', u'{{' + r'\1' + u'e siècle av. J.-C.}}' )
        text3 = replaceExceptMathNowikiLinksGalleryAndComments( text2, r'([XVI]+)(e|' +u'è' + r'me|eme)' + u' siècle' + r'([^\w])', u'{{' + r'\1' + u'e siècle}}' + r'\3' )
        text4 = wikipedia.replaceExceptMathNowikiAndComments( text3, r'\[\[([XVI]+e' +u' siècle)\]\]', r'{{\1}}' )
        return text4
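# Minimal usage sketch (assumes a configured pywikipedia installation; the
# page title is a placeholder):
#
#   site = wikipedia.getSite()
#   toolkit = CosmeticChangesToolkit(site)
#   page = wikipedia.Page(site, u'Exemple')
#   cleaned = toolkit.change(page.get())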
 
class DictToolkit:
    def __init__(self, site, debug = False):
        self.site = site
        self.debug = debug
        self.words = {}
        try:
            f = codecs.open( "dict.txt", 'r', encoding = site.encoding())
            for line in f.readlines():
                # remove trailing newlines and carriage returns
                line = line.rstrip('\r\n')
                # skip empty and malformed lines
                w = line.split(' ')
                if len(w) >= 2:
                    self.words[w[0]] = w[1]
            f.close()
        except IOError:
            print "Warning! There is no wordlist for your language!"
        else:
            print "Wordlist successfully loaded."
 
    def change(self, text):
        ct = text
        for (k, v) in self.words.iteritems():
            ct = replaceExceptMathNowikiLinksGalleryAndComments( ct, k, v )
        return ct
 
    def bad_text(self, text):
        for k in self.words.iterkeys():
            if re.search(k, text):
                return True
        return False
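# The expected dict.txt format is one replacement per line: a regular
# expression and its substitution separated by a single space (a hypothetical
# example):
#
#   ortographe orthographe
#   language langage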
 
class PimpBot:
    def __init__(self, generator, acceptall = False):
        self.generator = generator
        self.acceptall = acceptall
        self.debug = not acceptall
        # Load default summary message.
        wikipedia.setAction(wikipedia.translate(wikipedia.getSite(), msg_standalone))
        self.ccToolkit = CosmeticChangesToolkit(wikipedia.getSite(), debug = self.debug)
        self.dictToolkit = DictToolkit( wikipedia.getSite(), debug = self.debug)
 
 
    def run(self):
        for page in self.generator:
            try:
                originalText = page.get()
                changedText = self.ccToolkit.change(originalText)
                changedText = self.dictToolkit.change(changedText)
                if changedText != originalText:
                    if self.debug:
                        wikipedia.showDiff(originalText, changedText)
 
                    if not self.acceptall:
                        choice = wikipedia.inputChoice(u'Do you want to accept these changes?',  ['Yes', 'No', 'All'], ['y', 'N', 'a'], 'N')
                        if choice in ['a', 'A']:
                            self.acceptall = True
                    if self.acceptall or choice in ['y', 'Y']:
                        page.put(changedText)
            except wikipedia.NoPage:
                print "Page %s does not exist?!" % page.aslink()
            except wikipedia.IsRedirectPage:
                print "Page %s is a redirect; skipping." % page.aslink()
            except wikipedia.LockedPage:
                print "Page %s is locked?!" % page.aslink()
 
 
def main():
    #page generator
    gen = None
    pageTitle = []
    for arg in wikipedia.handleArgs():
        if arg.startswith('-start:'):
            gen = pagegenerators.AllpagesPageGenerator(arg[7:])
        elif arg.startswith('-ref:'):
            referredPage = wikipedia.Page(wikipedia.getSite(), arg[5:])
            gen = pagegenerators.ReferringPageGenerator(referredPage)
        elif arg.startswith('-links:'):
            linkingPage = wikipedia.Page(wikipedia.getSite(), arg[7:])
            gen = pagegenerators.LinkedPageGenerator(linkingPage)
        elif arg.startswith('-file:'):
            gen = pagegenerators.TextfilePageGenerator(arg[6:])
        elif arg.startswith('-cat:'):
            cat = catlib.Category(wikipedia.getSite(), arg[5:])
            gen = pagegenerators.CategorizedPageGenerator(cat)
        else:
            pageTitle.append(arg)
 
    if pageTitle:
        page = wikipedia.Page(wikipedia.getSite(), ' '.join(pageTitle))
        gen = iter([page])
    if not gen:
        wikipedia.showHelp()
    else:
        preloadingGen = pagegenerators.PreloadingGenerator(gen)
        bot = PimpBot(preloadingGen)
        print "Bot is running"
        bot.run()
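# Typical invocations (the script file name is a placeholder):
#
#   python pimpbot.py -start:A        # walk all pages starting from 'A'
#   python pimpbot.py -cat:France     # work on the members of a category
#   python pimpbot.py Paris           # work on a single page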
 
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()