Utilisateur:ZX81-bot/Script/idh pays.pl

Un article de Wikipédia, l'encyclopédie libre.

#!/usr/bin/perl
#
# ZX81-bot maintenance script for the French-language Wikipedia.
#
# The script logs in, lists the main-namespace pages that embed
# "Modèle:Infobox Pays", and adds IDH (HDI) parameters to each infobox,
# using the HDI_* fields of the article linked through the [[en:...]]
# interlanguage link.
#
# This first section holds the configuration, builds the HTTP client and
# performs the login.

use strict;
use warnings;
use utf8;    # this source file contains UTF-8 string literals

use LWP::UserAgent;
use XML::DOM;
use locale;
use POSIX qw(locale_h);
use HTML::Entities;
use Encode;

# Bot account credentials (the password is a placeholder in the published copy).
my $username = 'ZX81-bot';
my $password = '********';

# Language code of the wiki to edit, and page size of the embeddedin query.
my $lang = 'fr';
my $eilimit = "5000";

setlocale(LC_COLLATE, "fr_FR.UTF-8");

# Article titles contain non-ASCII characters; emit them as UTF-8.
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Template whose transclusions are processed.
my $title = 'Modèle:Infobox Pays';

my (@articles,$article);

# URL templates; the %s placeholders are filled in with sprintf.
my $site_url = 'http://'.$lang.'.wikipedia.org';
my $login_url = $site_url . '/w/index.php?title=Special:Userlogin&action=submitlogin&type=login';
my $query_url = $site_url . '/w/query.php?what=embeddedin&titles=%s&eilimit=%s&eicontfrom=%s&format=xml';
my $edit_url = $site_url . '/w/index.php?action=edit&title=%s';
my $en_url = 'http://en.wikipedia.org' . '/w/index.php?action=raw&title=%s';
my $url;

my $ua = LWP::UserAgent->new();
# An agent string ending in a space gets the default "libwww-perl/x.y"
# identifier appended by LWP, so no private method is needed.
$ua->agent('ZX81-bot/0.1 ');

# Persist the session cookies between runs.
$ua->cookie_jar({ file => "$ENV{HOME}/.wikipedia.cookies", autosave => 1 });

$ua->default_header('Accept-Language' => 'fr, fr-fr, en, en-us');
$ua->default_header('Accept-Charset' => 'utf-8');


my $rep = $ua->post($login_url, [ wpName => $username,
                        wpPassword => $password,
                        wpRemember => '1',
                        wpLoginAttempt => 'Identification' ]);

# The script treats anything other than an HTTP 302 redirect as a failed login.
die "L'authentification a échoué" if ($rep->code != 302);

print STDERR "Auth : Ok\n";
 
# Collect into @articles the titles of the pages that embed $title.
#
# Results are fetched page by page. $eicontfrom is the continuation token:
# the empty string requests the first page, and undef ends the loop.
my $eicontfrom = "";

my ($parser,$doc);
my ($ei,$ns,$query);

while (defined $eicontfrom) {
  $url = sprintf $query_url, $title, $eilimit, $eicontfrom;
  $rep = $ua->get($url);
  if ($rep->is_error) {
    print STDERR $rep->headers_as_string;
    die "La requête [$url] a échoué.\n", $rep->status_line, "\n";
  }
  $parser = XML::DOM::Parser->new;
  $doc = $parser->parse($rep->content);

  foreach $ei ($doc->getElementsByTagName("ei")) {
    # Keep only entries whose "ns" attribute is absent, empty or 0.
    next if $ei->getAttribute("ns");
    # An <ei/> element without text has no child node to read a title from.
    my $title_node = $ei->getFirstChild;
    next unless defined $title_node;
    push @articles, $title_node->getNodeValue;
  }

  # getAttribute() returns "" (not undef) when the attribute is absent, so an
  # empty token must end the loop; otherwise the first page would be
  # requested again and again.
  my $next = "";
  $query = $doc->getElementsByTagName("query");
  if ($query->getLength != 0) {
    my $embeddedin = $query->item(0)->getElementsByTagName("embeddedin")->item(0);
    $next = $embeddedin->getAttribute("next") if defined $embeddedin;
  }
  $eicontfrom = length($next) ? $next : undef;

  # XML::DOM trees contain circular references; release them explicitly.
  $doc->dispose;
}
 
# For every collected article: fetch its edit form, skip it if the infobox
# already has an IDH parameter, read the HDI fields from the linked English
# article, insert the IDH parameters before the "monnaie" parameter and save.

# 0 is Perl's default: STDOUT stays buffered.
$| = 0;
my ($page, $token, $time, $text, $enarticle, $summary);
my ($hdi,$hdi_year,$hdi_rank,$hdi_category,$idh);

# The edit summary is the same for every article; encode it to UTF-8 bytes once.
$summary = Encode::encode('UTF-8', 'Robot : ajout des paramètres IDH dans {{Infobox Pays}}');

foreach $article (@articles) {
  print "fr:$article : GET\n";
  $url = sprintf $edit_url, $article;
  $rep = $ua->get($url);
  if ($rep->is_error) {
    print STDERR $rep->headers_as_string;
    die "La requête [$url] a échoué.\n", $rep->status_line, "\n";
  }

  $page = Encode::decode('UTF-8', $rep->content);

  # List-context matches leave the variable undef on failure. Reading $1 after
  # an unchecked match would silently reuse the previous capture, and the bot
  # could then save the token or timestamp as the article text.
  ($token) = $page =~ m{<input type='hidden' value="([^"]*)" name="wpEditToken" />}s;
  ($time)  = $page =~ m{<input type='hidden' value="([^"]*)" name="wpEdittime" />}s;
  # Non-greedy, so the capture stops at the first closing tag.
  ($text)  = $page =~ m{<textarea [^>]*name="wpTextbox1"[^>]*>(.*?)</textarea>}s;
  unless (defined $token && defined $time && defined $text) {
    print STDERR "fr:$article : formulaire d'édition incomplet, passé\n";
    next;
  }
  HTML::Entities::decode($text);

  # Nothing to do when the infobox already carries an IDH parameter.
  if ($text =~ /\|\s*IDH\s*=/s) {
    print "fr:$article : passé\n";
    next;
  }

  ($hdi,$hdi_year,$hdi_rank,$hdi_category) = ("","","","");

  # Follow the [[en:...]] interlanguage link, if any, to read the HDI_* fields.
  if ($text =~ /\[\[[ _]*en[ _]*:[ _]*([^\]]+)[ _]*\]\]/) {
    $enarticle = $1;

    print "en:$enarticle : GET\n";
    $url = sprintf $en_url, $enarticle;
    $rep = $ua->get($url);
    if ($rep->is_error) {
      print STDERR $rep->headers_as_string;
      die "La requête [$url] a échoué.\n", $rep->status_line, "\n";
    }

    $page = Encode::decode('UTF-8', $rep->content);

    # Optional leading {{template}}, then the value with the decimal point
    # rewritten as a decimal comma.
    if ($page =~ /\|\s*HDI\s*=\s*(\{\{[^\}]+\}\})?\s*(\d+)\.(\d+)/) {
      my $prefix = defined $1 ? $1 : "";
      $hdi = "$prefix $2,$3";
      $hdi =~ s/^\s+//;
      $hdi =~ s/\s+$//;
    }
    # \d+ (not \d*) forces the lazy [^\|]*? to skip any text that precedes
    # the number inside the same parameter.
    if ($page =~ /\|\s*HDI_rank\s*=\s*[^\|]*?(\d+)/) {
      $hdi_rank = $1 . '{{e}}';
    }
    if ($page =~ /\|\s*HDI_year\s*=\s*(\d+)/) {
      $hdi_year = $1;
    }
    if ($page =~ /\|\s*HDI_category\s*=\s*[^|]*?(low|medium|high)/) {
      $hdi_category = $1;
      $hdi_category =~ s/high/élevé/;
      $hdi_category =~ s/medium/moyen/;
      $hdi_category =~ s/low/bas/;
    }
  }

  $idh = "| IDH=$hdi\n";
  $idh .= "| IDH_année=$hdi_year\n";
  $idh .= "| IDH_catégorie=$hdi_category\n";
  $idh .= "| IDH_rang=$hdi_rank\n";

  # Insert the new parameters just before "monnaie". Without that parameter
  # the text is unchanged, so do not submit a pointless edit.
  unless ($text =~ s/\|\s*(monnaie\s*=)/$idh| $1/s) {
    print STDERR "fr:$article : paramètre monnaie introuvable, passé\n";
    next;
  }

  $url = sprintf $edit_url, $article;
  $rep = $ua->post($url, [ wpTextbox1  => Encode::encode('UTF-8', $text),
                           wpMinoredit => '1',
                           wpSummary   => $summary,
                           wpEdittime  => $time,
                           wpEditToken => $token ]);

  if ($rep->is_error) {
    print STDERR $rep->headers_as_string;
    die "La requête [$url] a échoué.\n", $rep->status_line, "\n";
  }
}