Wikipédia:Requêtes XML/xml2sql

Un article de Wikipédia, l'encyclopédie libre.

Version sur le Wikipédia allemand : Xml2Sql

#!/usr/bin/perl

sub xmlunesc {
        my $text = shift;
        $text =~ s/</</sg;
        $text =~ s/>/>/sg;
        $text =~ s/&apos;/'/sg;
        $text =~ s/"/"/sg;
        $text =~ s/&/&/sg;
        $text;
}

sub sqlesc {
        my $text = shift;
        $text =~ s/\\/\\\\/sg;
        $text =~ s/\n/\\n/sg;
        $text =~ s/\r/\\r/sg;
        $text =~ s/\0/\\0/sg;
        $text =~ s/\x1A/\\Z/sg;
        $text =~ s/"/\\"/sg;
        $text =~ s/'/\\'/sg;
        $text;
}

sub xmlsiteinfo {
        while($xml =~ /<namespace key="(.*?)"(?: \/>|>(.*?)<\/namespace>)/sg) {
                $namespace{$2} = $1 if defined $2;
        }
        $xml = "";
}

sub xmlrevision {
        $xml =~ s/<revision>(.*?)<\/revision>//s;
        my ($revcontent, $rev) = ($1, {});
        xmlpage() unless defined $page;
        $xml = "";
        while($revcontent =~ /<(id|timestamp|contributor|minor|comment|text)(?: xml:space="preserve"| type="(.*?)")*(?:\s*\/>|>(.*?)<\/\1>)/sg) {
                my ($tag, $type, $content) = ($1, $2, $3);
                $content = "" unless defined $content;
                if($tag =~ /^text/) {
                        $rev->{"text"} = xmlunesc($content);
                } elsif($tag eq "contributor") {
                        if($content =~ /<username>(.*?)<\/username><id>(.*?)<\/id>/) {
                                $rev->{"user_text"} = xmlunesc($1);
                                $rev->{"user"} = $2;
                        } elsif($content =~ /<ip>(.*?)<\/ip>/) {
                                $rev->{"user_text"} = xmlunesc($1);
                                $rev->{"user"} = 0;
                        } else {
                                $rev->{"user_text"} = "_";
                                $rev->{"user"} = 0;
                        }
                } elsif($tag eq "comment") {
                        $rev->{"comment"} = xmlunesc($content);
                } elsif($tag eq "timestamp") {
                        $content =~ /^(....)-(..)-(..)T(..):(..):(..)Z$/;
                        $rev->{"timestamp"} = "$1$2$3$4$5$6";
                        $rev->{"inverse_timestamp"} = $rev->{"timestamp"};
                        $rev->{"inverse_timestamp"} =~ tr/0123456789/9876543210/;
                        $page->{"touched"} = $rev->{"timestamp"};
                } else {
                        $rev->{$tag} = $content;
                }
        }
        revision($page, $rev);
}

sub xmlpage {
        while($xml =~ s/<(title|id|restrictions)(?:\s*\/>|>(.*?)<\/\1>)//s) {
                my ($tag, $content) = ($1, $2);
                if($tag eq "title") {
                        $content = xmlunesc($content);
                        if($content =~ /(.+?):(.+)/ && defined $namespace{$1}) {
                                $page->{"namespace"} = $namespace{$1};
                                $content = $2;
                        } else {
                                $page->{"namespace"} = 0;
                        }
                }
                $page->{$tag} = defined $content ? $content : "";
        }
}

sub revision {
        my ($page, $rev) = @_;
        my $is_redirect = $rev->{"text"} =~ /^# ?redirect/i;
        my $len = length($rev->{"text"});
        return unless defined $page->{"title"};
        my $title = $page->{"title"};
        $title =~ s/\s/_/sg;
        if(defined $pageid{$page->{"id"}}) {
                print STDERR "duplicate key cur_id: ".$page->{"id"}."\n";
                return;
        }
        $ntdp = $page->{"namespace"}."+".$page->{"title"};
        if(defined $ntdp{$ntdp}) {
                print STDERR "duplicate key name_title_dup_prevention: $ntdp\n";
                return;
        }
        $rev->{"comment"} = "" unless defined $rev->{"comment"};
        $sql = sprintf "(%d,%d,'%s','%s','%s',%d,'%s','%s','%s',%d,%d,%d,%d,%f,'%s','%s')",
                $page->{"id"}, $page->{"namespace"}, sqlesc($title),
                sqlesc($rev->{"text"}), sqlesc($rev->{"comment"}), $rev->{"user"},
                sqlesc($rev->{"user_text"}), $rev->{"timestamp"},
                sqlesc($page->{"restrictions"}), 0, $is_redirect, defined $rev->{"minor"},
                0, rand(), $rev->{"inverse_timestamp"}, $page->{"touched"};
# Um Hauptspeicherbedarf um ca. 40% zu verringern kann man die folgende
# Zeile auskommentieren. Das ist problemlos möglich, solange sich MySQL nicht
# über einen doppelten Key beschwert und das Einlesen des Dumps abbricht.
        $pageid{$page->{"id"}}++;
        $ntdp{$ntdp}++;
        sqlflush() if $lsql+length($sql)>1000000;
        push @sql, $sql;
        $lsql += length($sql);
}

sub sqlflush {
        if(@sql>0) {
                print "INSERT INTO `cur` VALUES " . join(",", @sql) . ";\n";
                @sql = ();
                $lsql = 0;
        }
}

sub sqlheader {
        print <<"ABC";
DROP TABLE IF EXISTS `cur`;
CREATE TABLE `cur` (
  `cur_id` int(8) unsigned NOT NULL auto_increment,
  `cur_namespace` tinyint(2) unsigned NOT NULL default '0',
  `cur_title` varchar(255) binary NOT NULL default '',
  `cur_text` mediumtext NOT NULL,
  `cur_comment` tinyblob NOT NULL,
  `cur_user` int(5) unsigned NOT NULL default '0',
  `cur_user_text` varchar(255) binary NOT NULL default '',
  `cur_timestamp` varchar(14) binary NOT NULL default '',
  `cur_restrictions` tinyblob NOT NULL,
  `cur_counter` bigint(20) unsigned NOT NULL default '0',
  `cur_is_redirect` tinyint(1) unsigned NOT NULL default '0',
  `cur_minor_edit` tinyint(1) unsigned NOT NULL default '0',
  `cur_is_new` tinyint(1) unsigned NOT NULL default '0',
  `cur_random` double unsigned NOT NULL default '0',
  `inverse_timestamp` varchar(14) binary NOT NULL default '',
  `cur_touched` varchar(14) binary NOT NULL default '',
  UNIQUE KEY `cur_id` (`cur_id`),
  UNIQUE KEY `name_title_dup_prevention` (`cur_namespace`,`cur_title`),
  KEY `cur_title` (`cur_title`),
  KEY `cur_timestamp` (`cur_timestamp`),
  KEY `cur_random` (`cur_random`),
  KEY `name_title_timestamp` (`cur_namespace`,`cur_title`,`inverse_timestamp`),
  KEY `user_timestamp` (`cur_user`,`inverse_timestamp`),
  KEY `usertext_timestamp` (`cur_user_text`,`inverse_timestamp`),
  KEY `jamesspecialpages` (`cur_is_redirect`,`cur_namespace`,`cur_title`,`cur_timestamp`)
) TYPE=InnoDB PACK_KEYS=1;

/*!40000 ALTER TABLE `cur` DISABLE KEYS */;
LOCK TABLES `cur` WRITE;
ABC
}

sub sqlfooter {
        print "UNLOCK TABLES;\n";
        print "/*!40000 ALTER TABLE `cur` ENABLE KEYS */;\n";
}

$lsql = 0;
sqlheader();
while(<>) {
        $xml .= $_;
        if(/^\s*<\/siteinfo>/) {
                xmlsiteinfo();
        }
        next unless defined %namespace;
        if(/^\s*<\/revision>/) {
                xmlrevision();
        } elsif(/^\s*<\/page>/) {
                $xml = "";
                $page = undef;
        }
}
sqlflush();
sqlfooter();