Benutzer:Jah/hauptautoren.js

aus Wikipedia, der freien Enzyklopädie
Zur Navigation springen Zur Suche springen

Hinweis: Leere nach dem Veröffentlichen den Browser-Cache, um die Änderungen sehen zu können.

  • Firefox/Safari: Umschalttaste drücken und gleichzeitig Aktualisieren anklicken oder entweder Strg+F5 oder Strg+R (⌘+R auf dem Mac) drücken
  • Google Chrome: Umschalttaste+Strg+R (⌘+Umschalttaste+R auf dem Mac) drücken
  • Internet Explorer/Edge: Strg+F5 drücken oder Strg drücken und gleichzeitig Aktualisieren anklicken
  • Opera: Strg+F5
#!/usr/bin/perl -w

use utf8;

$lws  = 5;			# length of word sequence
$lang = "de";			# en, de
$mkImg = 1;			# 1: make history image, 0: don't
$histDir = "histcache";		# where to save the revision histories
$wiki = "de.wikipedia.org";	# can be changed with e.g. "-w de.wikibooks.org"
$imgUrlDir = "http://localhost/hauptautoren";

if($^O eq "linux") {
	$imgFileDir = "/var/www/html/hauptautoren";
	$netpbmPath = "";	# pnmtopng etc. should be in the system path
} else { # Windows
	$imgFileDir = "c:\\Programme\\Apache Software Foundation\\Apache2.2\\html\\hauptautoren";
	$netpbmPath = "c:\\Programme\\GnuWin32\\bin";
}

########## end of configuration #################################

use Digest::MD5 qw(md5_hex);
use Compress::Zlib;
use Encode;
use CGI qw(:standard);
use LWP;
$ua = LWP::UserAgent->new;
use open ":utf8"; binmode STDOUT, ":utf8";
use Cwd;
use File::Spec::Functions;
use Getopt::Std;

getopts('w:');
if(defined $opt_w) {
	$wiki = $opt_w;
}

if ($lang eq "en") {
	$category = "Category";
	$image = "Image";
	$words = "words";
	$fraction = "fraction";
	$user = "user";
} elsif ($lang eq "de") {
	$category = "Kategorie";
	$image = "Bild";
	$words = "Wörter";
	$fraction = "Anteil";
	$user = "Benutzer";
} else {
	die;
}

$| = 1;

$stop = 0;
$SIG{INT} = sub { $stop = 1 };
$SIG{PIPE} = sub { $stop = 1 };

$sNr = 0; $nWordsMax = 0;

import_names;

if(defined $ARGV[0]) {
	$title = $ARGV[0];
	$title =~ s/ /_/g;
	$title_md5 = md5_hex "$title";
	$cgi = 0;
	$imgFileDir = cwd;
	$imgFile = catfile($imgFileDir, "$title_md5.png");
	$imgUrl  = "file:$imgFile";
} elsif(defined $Q::page) {
	$title = $Q::page;
	$title =~ s/ /_/g;
	$title_md5 = md5_hex "$title";
	$cgi = 1;
	print header(-charset => 'utf-8');
	print "<!--\n";
	mkdir $imgFileDir if $mkImg && !-d $imgFileDir;
	$imgFile = catfile($imgFileDir, "hf.png");
	$imgUrl  = "$imgUrlDir/hf.png";
} else {
	die;
}

$allowedTags = 'b|big|blockquote|br|caption|center|cite|code|dd|del|div|dl|'.
	'dt|em|font|h1|h2|h3|h4|h5|h6|hr|i|ins|li|nowiki|ol|p|pre|ref|'.
	'references|rb|rp|rt|ruby|s|small|span|strike|strong|sub|sup|table|'.
	'td|th|tr|tt|u|ul|var';

sub progress {
	print $_[0];
}

$subdir = substr $title_md5, 0, 2;

$idLast = -1;
if(-f catfile($histDir, $wiki, $subdir, "$title_md5.seq")) {
	progress("Loading cached sequences ... ");
	open SEQ, catfile($histDir, $wiki, $subdir, "$title_md5.seq");
	while(<SEQ>) {
		chop;
		if(/^# (\d+)$/) {
			$idLast = $1;
		} else {
			@wordsIds = split;
			$seq = join(" ", @wordsIds[0..$lws-1]);
			$id{$seq} = [ split(/,/, $wordsIds[$lws]) ];
		}
	}
	close SEQ;
	progress("done.\n");
}

if($mkImg && -f catfile($histDir, $wiki, $subdir, "$title_md5.idh")) {
	progress("Loading cached author attribution info ... ");
	open IDH, catfile($histDir, $wiki, $subdir, "$title_md5.idh");
	binmode IDH;
	while(!eof(IDH)) {
		read IDH, $tmp, 4;
		$id = unpack('V', $tmp);
		read IDH, $tmp, 4;
		$nIdh = unpack('V', $tmp);
		push @nWords, $nIdh;
		$nWordsMax = $nIdh if $nIdh>$nWordsMax;
		read IDH, $tmp, 4;
		$idhBinGzLen = unpack('V', $tmp);
		read IDH, $idhBinGz, $idhBinGzLen;
		($gz, $status) = inflateInit(-WindowBits => 0 - MAX_WBITS);
		($idhBin, $status) = $gz->inflate($idhBinGz);
		$status==Z_STREAM_END or die $gz->msg();
		@idh = unpack("V[$nIdh]", $idhBin);
		push @idHist, [@idh];
	}
	close IDH;
	progress("done.\n");
	if(int(@idHist) != $idLast+1) {
		progress("Cache corrupted, must reanalyze.\n");
		unlink catfile($histDir, $wiki, $subdir, "$title_md5.seq");
		unlink catfile($histDir, $wiki, $subdir, "$title_md5.idh");
		%id = (); $idLast = -1;
		@nWords = (); @idHist = ();
	}
}

open RI, "perl loadhistory -w $wiki '$title' |";
$id=0;
progress("Analyzing history ...\n");
#open DBG, ">md5.txt";
while($revInfo = <RI>) {
	if($revInfo =~ /^# (.*)/) {
		$msg = $1;
		progress "\t$msg\n";
		if($msg =~ /^Error/) {
			close RI;
			exit;
		}
		next;
	}

	$revInfo0 = $revInfo;
	$revInfo0 =~ /user="(.*?)"/;
	if($id % 20 == 0) {
		$revInfo0 =~ /timestamp="(....)-(..)-(..)T(..):(..):(..)Z"/;
		progress("$1$2$3$4$5$6\n");
	}
	$author[$id] = $1;
	if($id<=$idLast && (!$mkImg || defined $idHist[$id])) {
		$id++;
		next;
	} else {
		analyzeText();
		$id++;
	}
	last if $stop;
}
#close DBG;
$id--;
analyzeText() if !defined $text0;
close RI;
if($stop) {
	progress("interrupted.\n");
	exit;
} else {
	progress("done.\n");
}

sub analyzeText {
	$revInfo0 =~ /pos=(\d+) len=(\d+)/;
	$pos = $1; $len = $2;
	open TXT, catfile($histDir, $wiki, $subdir, "$title_md5.txt");
	binmode TXT;
	seek TXT, $pos, 0;
	read TXT, $textGz, $len;
	($gz, $status) = inflateInit(-WindowBits => 0 - MAX_WBITS);
	($text, $status) = $gz->inflate($textGz);
	$status==Z_STREAM_END or die $gz->msg();
	close TXT;
#	print DBG md5_hex("$text"), "\n";
	$text = Encode::decode_utf8($text);

	# convert &lt;, &gt;, &amp; and remove (inter-)wikilinks
	$text =~ s/&lt;/</sg; $text =~ s/&gt;/>/sg; $text =~ s/&amp;/&/sg;
	$text =~ s/\[\[(.{2,3}|minnan|simple|ru-sib|be-x-old|zh-yue|zh-min-nan|nds-nl|bat-smg|map-bms|zh-classical):(.+?)\]\]//sg;
	$text =~ s/\[\[(?!(?:$category|$image|category|image):)([^\]\|]*?\|)?([^\|]+?)\]\]/$2/sg;
	$text =~ s/\n{3,}/\n\n/sg;
	$text0 = $text;

	# remove elements not to be colored
	while($text =~ s/\{\{((?!\{\{).)*?\}\}/ /sg) {}
	while($text =~ s/\{\|((?!\{\|).)*?\|\}/ /sg) {}
	$text =~ s/\[\[($category|$image|category|image):.*?\]\]/ /isg;
	$text =~ s/\*? ?\[(http|ftp|mailto).*?\]/ /isg;
	$text =~ s/\*? ?(http|ftp|mailto):\S*/ /isg;
	$text =~ s/<math>.*?<\/math>/ /sg;
	$text =~ s/<\/?($allowedTags)(\s+.{1,200}?)?\/?>/ /sg;
	$text =~ s/&(\w+|#(\d+|x[0-9A-Fa-f]+));/ /sg;

	@words = ();
	while ($text =~ /[\p{IsAlpha}]+/sg) {
		push @words, $&;
	}
	$nWords[$id] = @words;
	$nWordsMax = @words>$nWordsMax?@words:$nWordsMax;
	@id = ();
	for ($i=0; $i<@words; $i++) {
		$id[$i] = $id;
	}
	for ($i=0; $i<@words-$lws+1; $i++) {
		$seq = join(" ", @words[$i..$i+$lws-1]);
		if (defined $id{$seq}) {
			for ($j=$i; $j<$i+$lws; $j++) {
				if ($id[$j]>$id{$seq}[$j-$i]) {
					$id[$j] = $id{$seq}[$j-$i];
				}
			}
		}
	}
	my %idNew = ();
	for ($i=0; $i<@words-$lws+1; $i++) {
		$seq = join(" ", @words[$i..$i+$lws-1]);
		if (!defined $id{$seq}) {
			for ($j=$i; $j<$i+$lws; $j++) {
				$id{$seq}[$j-$i] = $id[$j];
				$idNew{$seq}[$j-$i] = $id[$j];
			}
		}
	}
	open SEQ, ">>" . catfile($histDir, $wiki, $subdir, "$title_md5.seq");
	print SEQ "# $id\n";
	foreach $seq (keys %idNew) {
		print SEQ $seq, " ", join(",", @{$idNew{$seq}}), "\n";
	}
	close SEQ;

	if($mkImg && !defined $idHist[$id]) {
		$idHist[$id] = [@id];
		open IDH, ">>".catfile($histDir, $wiki, $subdir, "$title_md5.idh");
		binmode IDH;
#print int(@id), "\n";
		print IDH pack('V', $id);
		print IDH pack('V', int(@id));
		$idhBin = pack('V['.int(@id).']', @id);
		($gz, $status) = deflateInit(-WindowBits => 0 - MAX_WBITS);
		($idhBinGz1, $status) = $gz->deflate($idhBin);
		($idhBinGz2, $status) = $gz->flush();
		$idhBinGz = $idhBinGz1 . $idhBinGz2;
		print IDH pack('V', do { use bytes; length $idhBinGz });
		print IDH $idhBinGz;
		close IDH;
	}
}


for ($i=0; $i<@words; $i++) {
    $words{$author[$id[$i]]}++;
}
@authors = sort {$words{$b} <=> $words{$a}} keys %words;
for ($i=0; $i<@authors; $i++) {
    if ($i>5) {
	$color{$authors[$i]} = "#000000";
	$colorImg{$authors[$i]} = "\x00\x00\x00" if $mkImg;
    } else {
	$color{$authors[$i]} = ("#bf0000", "#00bf00", "#0000bf", "#007f7f",
				"#7f007f", "#7f7f00")[$i];
	$colorImg{$authors[$i]} = ("\xbf\x00\x00", "\x00\xbf\x00", "\x00\x00\xbf", "\x00\x7f\x7f",
				   "\x7f\x00\x7f", "\x7f\x7f\x00")[$i] if $mkImg;
    }
}

# compute history image
if ($mkImg) {
	progress("Computing image ...\n");
	open IMG, ">hf_tmp.ppm";
	binmode IMG;
	printf IMG "P6 %d %d 255\n", $id+1, $nWordsMax;
	for ($y=0; $y<$nWordsMax && !$stop; $y++) {
		print "$y/$nWordsMax\n" if $y%100==0;
		for ($x=0; $x<=$id; $x++) {
			if ($y<$nWords[$x]) {
				if (!defined $colorImg{$author[$idHist[$x][$y]]}) {
					print IMG "\x00\x00\x00";
				} else {
					print IMG $colorImg{$author[$idHist[$x][$y]]};
				}
			} else {
				print IMG "\xff\xff\xff";
			}
		}
	}
	close IMG;
	if(!$stop) {
		if(defined $netpbmPath && $netpbmPath ne "") {
			$pamscale = catfile($netpbmPath, "pamscale");
			$pnmtopng = catfile($netpbmPath, "pnmtopng");
		} else {
			$pamscale = "pamscale";
			$pnmtopng = "pnmtopng";
		}
		system "$pamscale -width 400 -height 400 hf_tmp.ppm > hf_tmp2.ppm";
		system "$pnmtopng hf_tmp2.ppm > $imgFile";
	}
	unlink "hf_tmp.ppm", "hf_tmp2.ppm";
	exit if $stop;
	progress("done.\n");
}

# mask elements not to be colored
sub subst {
	my $s = "___".$sNr++."___";
	$substBlock{$s} = $_[0];
	$s;
}
while($text0 =~ s/\{\{((?!\{\{).)*?\}\}/subst($&)/esg) {}
while($text0 =~ s/\{\|((?!\{\|).)*?\|\}/subst($&)/esg) {}
$text0 =~ s/\[\[($category|$image|category|image):.*?\]\]/subst($&)/iesg;
$text0 =~ s/\*? ?\[(http|ftp|mailto).*?\]/subst($&)/iesg;
$text0 =~ s/\*? ?(http|ftp|mailto):\S*/subst($&)/iesg;
$text0 =~ s/<math>.*?<\/math>/subst($&)/esg;
$text0 =~ s/<\/?($allowedTags)(\s+.{1,200}?)?\/?>/subst($&)/esg;
$text0 =~ s/&(\w+|#(\d+|x[0-9A-Fa-f]+));/subst($&)/esg;

# color the text
for ($i=0; $i<@words; $i++) {
	$text0 =~ s/^(.*?)$words[$i]//sg;
	$gap = $1;
	if ($i==0) {
		$coloredText = "$gap<font color=\"$color{$author[$id[$i]]}\">$words[$i]"
	} else {
		if ($gap =~ /^\s+$/ && $author[$id[$i]] eq $author[$id[$i-1]]) {
			$coloredText .= "$gap$words[$i]"
		} else {
			$coloredText .= "</font>$gap<font color=\"$color{$author[$id[$i]]}\">$words[$i]"
		}
	}
}
$coloredText .= "</font>$text0";

# fetch back masked elements
while ($coloredText =~ s/___\d+___/$substBlock{$&}/sg) {}

$stats = "";
if($mkImg) {
	$stats .= "<table cellspacing=\"10\">\n";
	$stats .= "<tr valign=\"top\"><td>\n";
}
$stats .= "<table cellspacing=\"0\" border=\"1\">\n";
$stats .= "<tr><th>$words</th><th>$fraction</th><th>$user</th></tr>\n";
for($i=0; $i<@authors; $i++) {
	$author = $authors[$i];
	$stats .= sprintf "<tr><td>%5i</td><td>%4.1f%%</td><td><a href=\"http://$wiki/wiki/$user:%s\" style=\"color:%s\">%s</a></td></tr>\n",
		$words{$author}, 100*$words{$author}/@words, $author, $color{$author}, $author;
	if($i<6) {
		$statsShort .= sprintf "<a href=\"http://$wiki/wiki/$user:%s\" style=\"color:%s\">%s</a> (%d)%s\n",
			$author, $color{$author}, $author, $words{$author}, $i<5?";":"";
	}
}
$stats .= "</table>\n";
if ($mkImg) {
	$stats .= "</td><td><img src=\"$imgUrl\"></td></tr>\n";
	$stats .= "</table>\n";
}

#open DBG, ">hf-debug.txt";
#print DBG $coloredText;
#close DBG;
exit if $stop;

progress("Sending preview request to $wiki ... ");
$url = "http://$wiki/w/index.php?title=$title&action=submit";
$response = $ua->post( $url, [
	wpTextbox1 => Encode::encode_utf8($coloredText),
	wpPreview => "Vorschau zeigen",
]);
$html = $response->decoded_content;
if($html =~ /Quelltext betrachten/) {
	progress("page is protected.\n");
	exit if $stop;
	progress("Sending another preview request to $wiki ... ");
	$url = "http://$wiki/w/index.php?title=${title}_tmp&action=submit";
	$response = $ua->post( $url, [
		wpTextbox1 => Encode::encode_utf8($coloredText),
		wpPreview => "Vorschau zeigen",
	]);
	$html = $response->decoded_content;
}
progress("done.\n");
exit if $stop;
$html =~ s/<head>/<head><base href="http:\/\/$wiki" \/>/s;
$html =~ s/<title>Bearbeiten von (.*?) - Vorschau - Wikipedia<\/title>/<title>$1 - Wikipedia<\/title>/s;
$html =~ s/<div class='previewnote'>.*?<\/div>//s;
$html =~ s/(<div id="wikiPreview">)<h2>.*?<\/h2>/$1/s;
$html =~ s/<h1 class="firstHeading">Bearbeiten von (.*?)<\/h1>/<h1 class="firstHeading">$1<\/h1>$statsShort<hr>/s;
$html =~ s/<p>Diese Seite ist \d+ kB groß\..*?<\/p>//s;
$html =~ s/<form id="editform".*?<\/form>/<p \/><hr \/><hr \/><p>$stats<\/p>/s;

if($cgi) {
	print "-->\n";
	print $html;
} else {
	open OUT, ">$title.html";
	print OUT $html;
	close OUT;
}