Benutzer:Jah/unicode.js

aus Wikipedia, der freien Enzyklopädie
Zur Navigation springen Zur Suche springen

Hinweis: Leere nach dem Veröffentlichen den Browser-Cache, um die Änderungen sehen zu können.

  • Firefox/Safari: Umschalttaste drücken und gleichzeitig Aktualisieren anklicken oder entweder Strg+F5 oder Strg+R (⌘+R auf dem Mac) drücken
  • Google Chrome: Umschalttaste+Strg+R (⌘+Umschalttaste+R auf dem Mac) drücken
  • Internet Explorer/Edge: Strg+F5 drücken oder Strg drücken und gleichzeitig Aktualisieren anklicken
  • Opera: Strg+F5
#!/usr/bin/perl -w


# Number of code points handled: 17 planes of 0x10000 code points each.
$nChar = 17 * 0x10000;

# General category table as a flat list of (abbreviation => description)
# pairs.  The position of a pair in this list (0, 1, 2, ...) is the numeric
# code that stands for the category everywhere else in this script.
@generalCategory = (
	"Lu" => "Letter, Uppercase",
	"Ll" => "Letter, Lowercase",
	"Lt" => "Letter, Titlecase",
	"Lm" => "Letter, Modifier",
	"Lo" => "Letter, Other",
	"Mn" => "Mark, Nonspacing",
	"Mc" => "Mark, Spacing Combining",
	"Me" => "Mark, Enclosing",
	"Nd" => "Number, Decimal Digit",
	"Nl" => "Number, Letter",
	"No" => "Number, Other",
	"Pc" => "Punctuation, Connector",
	"Pd" => "Punctuation, Dash",
	"Ps" => "Punctuation, Open",
	"Pe" => "Punctuation, Close",
	"Pi" => "Punctuation, Initial quote (may behave like Ps or Pe depending on usage)",
	"Pf" => "Punctuation, Final quote (may behave like Ps or Pe depending on usage)",
	"Po" => "Punctuation, Other",
	"Sm" => "Symbol, Math",
	"Sc" => "Symbol, Currency",
	"Sk" => "Symbol, Modifier",
	"So" => "Symbol, Other",
	"Zs" => "Separator, Space",
	"Zl" => "Separator, Line",
	"Zp" => "Separator, Paragraph",
	"Cc" => "Other, Control",
	"Cf" => "Other, Format",
	"Cs" => "Other, Surrogate",
	"Co" => "Other, Private Use",
	"Cn" => "Other, Not Assigned (no characters in the file have this property)",
);

# Reverse lookup: abbreviation -> numeric code (the index of its pair).
my $categoryCount = @generalCategory / 2;
foreach my $nr (0 .. $categoryCount - 1) {
	$generalCategoryNr{ $generalCategory[2 * $nr] } = $nr;
}

# Read UnicodeData.txt and fill @GC, indexed by code point, with the numeric
# general category of every character below $nChar.  Bit 0x80 is set in
# addition when the character name contains "CJK".
#
# Each line is a ';'-separated record; this script uses field 0 (code point
# in hex), field 1 (character name) and field 2 (general category
# abbreviation).  A pair of lines whose names end in "First>" and "Last>"
# describes a whole range of code points that share the same properties.
my $unicodeDataFile = "UnicodeData.txt";
open(my $in, "<", $unicodeDataFile)
	or die "Cannot open $unicodeDataFile: $!\n";

my $rangeFirst;		# start of a pending "First>" .. "Last>" range
while(my $line = <$in>) {
	my @character = split /;/, $line;

	# Skip blank or malformed lines instead of misreading them as U+0000.
	next if @character < 3 || $character[0] !~ /^[0-9A-Fa-f]+$/;

	my $value = hex $character[0];
	next if $value >= $nChar;

	if($character[1] =~ /First>$/) {
		$rangeFirst = $value;
		next;
	}

	my $gc = $generalCategoryNr{$character[2]};
	unless(defined $gc) {
		warn sprintf("Unknown general category '%s' for U+%04X, treating it as Cn\n",
			$character[2], $value);
		$gc = $generalCategoryNr{"Cn"};
	}
	$gc |= 0x80 if $character[1] =~ /CJK/;

	if($character[1] =~ /Last>$/) {
		unless(defined $rangeFirst) {
			# No matching "First>" line was seen: cover this code point only.
			warn sprintf("Range end U+%04X without a range start\n", $value);
			$rangeFirst = $value;
		}
		@GC[$rangeFirst .. $value] = ($gc) x ($value - $rangeFirst + 1);
		$rangeFirst = undef;	# never reuse a start for a later range
	} else {
		$GC[$value] = $gc;
	}
}
close($in);

# Every code point not listed in the file is "Cn" (not assigned).
my $notAssigned = $generalCategoryNr{"Cn"};
for my $cp (0 .. $nChar - 1) {
	$GC[$cp] = $notAssigned unless defined $GC[$cp];
}

# Write four files describing runs of consecutive code points in @GC:
#
#   alphaRanges   - letters and numbers (category starting with L or N, CJK
#                   bit 0x80 not set) as literal UTF-8 characters: "a" for a
#                   single character, "ab" for two, "a-z" for longer runs
#   alphaRanges2  - the same runs as C conditions ("|| (c>=0x.. && c<=0x..)")
#   CJKRanges     - like alphaRanges, for code points with bit 0x80 set
#   CJKRanges2    - like alphaRanges2, for code points with bit 0x80 set

# Open $name for writing with the given PerlIO layer ("" for none).
# Dies if the file cannot be created.
sub _openRangeFile {
	my ($name, $layer) = @_;
	open(my $fh, ">$layer", $name)
		or die "Cannot create $name: $!\n";
	return $fh;
}

# Feed one code point to a run tracker.  $run is a hash with the output
# handles 'class' and 'cond' and the state 'active' / 'first'; $isMember
# tells whether $cp belongs to the tracked set.  Output is produced when a
# run starts (its first character) and when it ends (range end, condition).
sub _trackRun {
	my ($run, $cp, $isMember) = @_;
	if($isMember && !$run->{active}) {
		print { $run->{class} } chr($cp);
		$run->{first} = $cp;
	}
	if(!$isMember && $run->{active}) {
		my ($first, $last) = ($run->{first}, $cp - 1);
		if($last > $first + 1) {
			# three or more characters: "first-last"
			print { $run->{class} } "-".chr($last);
		} elsif($last == $first + 1) {
			# exactly two characters: list both, no dash needed
			print { $run->{class} } chr($last);
		}
		if($last > $first) {
			printf { $run->{cond} } "\t\t\t|| (c>=0x%x && c<=0x%x)\n", $first, $last;
		} else {
			printf { $run->{cond} } "\t\t\t|| c==0x%x\n", $first;
		}
	}
	$run->{active} = $isMember;
}

my %alphaRun = (
	label  => "alpha",
	class  => _openRangeFile("alphaRanges", ":utf8"),
	cond   => _openRangeFile("alphaRanges2", ""),
	active => 0,
);
my %cjkRun = (
	label  => "CJK",
	class  => _openRangeFile("CJKRanges", ":utf8"),
	cond   => _openRangeFile("CJKRanges2", ""),
	active => 0,
);

# Iterate one position past the table: the extra step is a sentinel that
# belongs to neither set, so a run reaching the last code point is closed
# and written out instead of being silently dropped.
for my $cp (0 .. $nChar) {
	my $inTable = $cp < $nChar;
	my $isCJK = $inTable && $GC[$cp] >= 0x80;
	my $isAlpha = $inTable && !$isCJK && $generalCategory[2*$GC[$cp]] =~ /^[LN]/;
	_trackRun(\%alphaRun, $cp, $isAlpha);
	_trackRun(\%cjkRun, $cp, $isCJK);
}

for my $run (\%alphaRun, \%cjkRun) {
	print { $run->{class} } "\n";
	for my $key ("class", "cond") {
		# Buffered write errors only show up at close time.
		close($run->{$key})
			or die "Error writing $run->{label} ranges ($key file): $!\n";
	}
}

# Write the C header unicodeAttributes.h: one "#define unicode_XX" per
# general category (value = numeric category code) followed by the table
# "unicodeGC" with one byte per code point, 16 values per source line.
my $headerFile = "unicodeAttributes.h";
open(my $out, ">", $headerFile)
	or die "Cannot create $headerFile: $!\n";

for(my $k = 0; $k < @generalCategory; $k += 2) {
	printf {$out} "#define unicode_%s 0x%02x\t\t/* %s */\n", $generalCategory[$k], $k/2, $generalCategory[$k+1];
}

print {$out} "\nunsigned char unicodeGC[$nChar] = {\n";
for(my $rowStart = 0; $rowStart < $nChar; $rowStart += 16) {
	# Clamp the last row so the number of initialisers always equals $nChar,
	# even if $nChar is not a multiple of 16.
	my $rowEnd = $rowStart + 15;
	$rowEnd = $nChar - 1 if $rowEnd > $nChar - 1;
	print {$out} "\t", join(",", map { sprintf "0x%02x", $_ } @GC[$rowStart..$rowEnd]), ($rowEnd < $nChar - 1 ? ",\n" : "\n");
}
print {$out} "};\n";

# Buffered write errors (e.g. disk full) only surface when closing.
close($out)
	or die "Error writing $headerFile: $!\n";