Utilisateur:Patator Bot

Un article de Wikipédia, l'encyclopédie libre.

Patator Bot est un bot expérimental maintenu par user:Mr Patate

Pour l'instant, Patator se propose de faire des recherches de copyvios.

Après un essai de création du code ex-nihilo presque concluant (gros problème de mémoire au bout de ~50 articles traités) qui m'a permis de me familiariser avec python, je me base maintenant sur les bibliothèques de pywikipedia.

voici l'avancement actuel du code :

#!/usr/bin/python
# -*- coding: utf-8 -*-
#(c) [[fr:user:Mr Patate]]
#with real drops of pywikipedia/wikipedia.py (by Rob W.W. Hooft and Andre Engels)
#code within.
#Therefore the license of this script may be [[MIT license]]
#or any compatible one.
#...
#Anyway, who cares?

############################    WARNING    ####################################
#This script uses google. According to google terms of use, bots are not      #
#allowed to send queries. This bot is experimental and this script should    #
#not be run.                                                                  #
###############################################################################

'''Checks texts such as Newpages or newbies contribs to find copyvios.
This bot works alone, but because of the number of pages and texts to check,
it may work as part of a network in the future.'''

import login
import wikipedia
import re,datetime

# NOTE: the original 'global site,dataPage,resultPage' declaration was removed:
# 'global' at module level is a no-op, and these names are already plain
# module-level globals.

###################### config ####################

pwd = ''                                  #password of the bot account (filled in before running)
lang = 'fr'                               #language code of the target Wikipedia
resultPage = 'user:Patator Bot/Resultats' #this page displays results
dataPage = 'user:Patator Bot/Donnees'     #this one keeps data such as whitelists, last checked articles...
site = wikipedia.Site(lang)               #pywikipedia Site object for the target wiki


##################################################

def getNewpages(number = 50) :

    throttle = True
    path = site.newpages_address()[:-2] + str(number)
    print path
    wikipedia.get_throttle()
    html = site.getUrl(path)

    #this regex gives some quite odd results for "username", but I don't
    #speak regex fluently therefore I won't fix it today
    entryR = re.compile('<li>(?P<date>.+?) <a href=".+?" title="(?P<title>.+?)">.+?</a> \((?P<length>\d+)(.+?)\) \. \. (?P<loggedin><a href=".+?" title=".+?">)?(?P<username>.+?)(</a>)?( <em>\((?P<comment>.+?)\)</em>)?</li>')

    List = []
    for m in entryR.finditer(html):
        date = m.group('date')
        date = date2datetime(date)
        title = m.group('title')
        title = title.replace('"', '"')
        username = m.group('username')
        enduser = re.search('</a>', username).start()  #can be replaced by a correct regex 
        username=username[:enduser]
        if date > datas.LastNewpage:
            newpagesList.append(textToCheck(title, date, username))

    for i in newpagesList:         #debug
        print i.title,i.username
            
def date2datetime(date):
    '''Convert a date string into a datetime.datetime object.

    Supported formats:
      * ISO-like: 'YYYY-MM-DD HH:MM'
      * French MediaWiki: u'JJ mois AAAA à HH:MM'

    Raises ValueError for any other format.  (The original printed
    'Date error' and returned the string 'Error', which was then silently
    compared against datetime objects by the caller.)
    '''
    if date[4] == '-':
        #ISO-like form: normalise all separators to spaces and split.
        parts = date.replace('-', ' ').replace(':', ' ').split()
        return datetime.datetime(int(parts[0]), int(parts[1]), int(parts[2]),
                                 int(parts[3]), int(parts[4]))

    elif date[-7] == u'à':
        #French MediaWiki form; map month names to their numbers.
        monthCode = {u'janvier':1, u'février':2, u'mars':3, u'avril':4,
                     u'mai':5, u'juin':6, u'juillet':7, u'août':8,
                     u'septembre':9, u'octobre':10, u'novembre':11,
                     u'décembre':12}
        #After splitting: [day, month, year, u'à', hour, minute]
        parts = date.replace(':', ' ').split()
        return datetime.datetime(int(parts[2]), monthCode[parts[1]],
                                 int(parts[0]), int(parts[4]), int(parts[5]))

    raise ValueError('Unsupported date format: %r' % (date,))

class datas:
    '''Snapshot of the bot's persistent state, parsed once at import time
    from the on-wiki data page (dataPage).

    Attributes:
      UserWlist   -- text between <userWlist> tags (whitelisted users)
      SiteWlist   -- text between <siteWlist> tags (whitelisted sites)
      LastNewpage -- datetime of the last new page already checked
    '''

    datas = wikipedia.Page(site, dataPage).get()

    #Each section lives between a pair of pseudo-XML tags on the data page;
    #re.DOTALL lets a section span several lines.
    UserWlist = re.search('<userWlist>(.*?)</userWlist>',
                          datas, re.DOTALL).group(1)

    SiteWlist = re.search('<siteWlist>(.*?)</siteWlist>',
                          datas, re.DOTALL).group(1)

    LastNewpage = date2datetime(
        re.search('<last newpage checked>(.*?)</last newpage checked>',
                  datas, re.DOTALL).group(1))

class textToCheck:
    '''Record describing one article or text awaiting a copyvio check.'''

    def __init__(self, title, date, user):
        #Keep the metadata identifying the page to inspect.
        self.date = date
        self.username = user
        self.title = title
            
################### Main program ################# 
# Log in (pwd is the password; the second argument presumably selects a
# non-sysop login -- TODO confirm against pywikipedia's LoginManager),
# fetch the latest new pages, then let pywikipedia clean up its throttle.
login.LoginManager(pwd,False,site)     
getNewpages()
wikipedia.stopme()
    
