make_unicode: Add a table of downloaded files and their URLs.

Signed-off-by: Alexandre Julliard <julliard@winehq.org>
2024-09-01 21:53:58 +00:00 · 2022-06-24 10:58:21 +02:00 · 2022-06-24 10:58:21 +02:00 · cfaa28933b
parent 3ec7c467cd
commit cfaa28933b
1 changed file with 81 additions and 89 deletions
--- a/tools/make_unicode
+++ b/tools/make_unicode
@ -25,25 +25,27 @@ use Digest::SHA;
 use Encode;
 use Time::Local qw(timegm_modern);

-# base URLs for www.unicode.org files
 my $UNIVERSION = "14.0.0";
-my $UNIDATA  = "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip";
-my $UNIHAN   = "https://www.unicode.org/Public/$UNIVERSION/ucd/Unihan.zip";
-my $IDNADATA = "https://www.unicode.org/Public/idna/$UNIVERSION";
-my $JISDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS";
-my $KSCDATA  = "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC";
-my $MSDATA = "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498";
-my $MSCODEPAGES = "$MSDATA/Windows Supported Code Page Data Files.zip";
-
 my $CLDRVERSION = "41";
-my $CLDRDATA = "https://github.com/unicode-org/cldr/archive/refs/tags/release-$CLDRVERSION.zip";
-my $CLDR33DATA = "https://www.unicode.org/Public/cldr/33/cldr-common-33.0.zip";
-
 my $ISO639VERSION = "20220120";
-my $ISO639 =  "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_$ISO639VERSION.zip";
-
 my $TZVERSION = "2022a";
-my $TZDATA = "https://data.iana.org/time-zones/releases/tzdata$TZVERSION.tar.gz";
+
+my %data_files =  # downloadable input files, keyed by the id that callers pass to open_data_file
+(  # each entry: url => where to download from, name => optional cache file name (defaults to the last URL component)
+ ucd       => { url  => "https://www.unicode.org/Public/$UNIVERSION/ucd/UCD.zip", name => "UCD-$UNIVERSION.zip" },
+ unihan    => { url  => "https://www.unicode.org/Public/$UNIVERSION/ucd/Unihan.zip", name => "Unihan-$UNIVERSION.zip" },
+ idna      => { url  => "https://www.unicode.org/Public/idna/$UNIVERSION/IdnaMappingTable.txt", name => "IdnaMappingTable-$UNIVERSION.txt" },
+ cldr      => { url  => "https://github.com/unicode-org/cldr/archive/refs/tags/release-$CLDRVERSION.zip" },
+ cldr33    => { url  => "https://www.unicode.org/Public/cldr/33/cldr-common-33.0.zip" },
+ sorting   => { url  => "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498/Windows 10 Sorting Weight Table.txt" },
+ codepages => { url  => "https://download.microsoft.com/download/C/F/7/CF713A5E-9FBC-4FD6-9246-275F65C0E498/Windows Supported Code Page Data Files.zip" },
+ iso639    => { url  => "https://iso639-3.sil.org/sites/iso639-3/files/downloads/iso-639-3_Code_Tables_$ISO639VERSION.zip" },
+ ksx1001   => { url  => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/KSC/KSX1001.TXT" },
+ jis0208   => { url  => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT" },
+ jis0212   => { url  => "https://www.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0212.TXT" },
+ tzdata    => { url  => "https://data.iana.org/time-zones/releases/tzdata$TZVERSION.tar.gz" },
+);
+

 # Default char for undefined mappings
 my $DEF_CHAR = ord '?';
@ -1857,6 +1859,8 @@ my %joining_forms =
   "medial" => []
 );

+my $current_data_file;  # set by open_data_file: the URL of the file last opened, or "URL:name" when a name was passed
+
 sub to_utf16(@)
 {
    my @ret;
@ -1877,56 +1881,44 @@ sub to_utf16(@)

 ################################################################
 # fetch a unicode.org file and open it
-sub open_data_file($$)
+sub open_data_file($@)  # returns a read handle on the requested data file, downloading it into the cache first if needed
 {
-    my ($base, $name) = @_;
+    my ($id, $name) = @_;  # $id: key into %data_files; $name: optional member to extract from an archive
+    my $data = $data_files{$id};
    my $cache = ($ENV{XDG_CACHE_HOME} || "$ENV{HOME}/.cache") . "/wine";
-    (my $dir = "$cache/$name") =~ s/\/[^\/]+$//;
-    my $suffix = ($base =~ /\/\Q$UNIVERSION\E/) ? "-$UNIVERSION" : "";
    local *FILE;

-    if ($base =~ /.*\/([^\/]+)\.zip$/)
+    my $url = $data->{url};
+    my $filename = "$cache/" . ($data->{name} || ($url =~ s/.*\/([^\/]+)$/$1/r));  # cache path: explicit name, else last component of the URL
+    unless (-f $filename)  # download only when the file is not already in the cache
    {
-        my $zip = "$1$suffix.zip";
-        unless (-f "$cache/$zip")
-        {
-            system "mkdir", "-p", $cache;
-            print "Fetching $base...\n";
-            !system "wget", "-q", "-O", "$cache/$zip", $base or die "cannot fetch $base";
-        }
-        open FILE, "-|", "unzip", "-p", "$cache/$zip", $name or die "cannot extract $name from $zip";
+        print "Fetching $url...\n";
+        system "mkdir", "-p", $cache;
+        !system "wget", "-q", "-O", $filename, $url or die "cannot fetch $url";  # NOTE(review): a failed wget may leave a partial $filename that the -f test accepts on the next run -- confirm
    }
-    elsif ($base =~ /.*\/([^\/]+)\.tar\.gz$/)
+
+    if ($filename =~ /\.zip$/)  # how the file is read is chosen from the cache file's extension
    {
-        my $tar = "$1$suffix.tar.gz";
-        unless (-f "$cache/$tar")
-        {
-            system "mkdir", "-p", $cache;
-            print "Fetching $base...\n";
-            !system "wget", "-q", "-O", "$cache/$tar", $base or die "cannot fetch $base";
-        }
-        open FILE, "-|", "tar", "-x", "-f", "$cache/$tar", "-O", $name or die "cannot extract $name from $tar";
+        open FILE, "-|", "unzip", "-p", $filename, $name or die "cannot extract $name from $filename";  # read member $name from the output of unzip -p
+    }
+    elsif ($filename =~ /\.tar\.gz$/)
+    {
+        open FILE, "-|", "tar", "-x", "-f", $filename, "-O", $name or die "cannot extract $name from $filename";  # read member $name from the output of tar -x -O
    }
    else
    {
-        (my $dest = "$cache/$name") =~ s/(.*)(\.[^\/.]+)$/$1$suffix$2/;
-        unless (-f $dest)
-        {
-            system "mkdir", "-p", $dir;
-            print "Fetching $base/$name...\n";
-            !system "wget", "-q", "-O", $dest, "$base/$name" or die "cannot fetch $base/$name";
-        }
-        open FILE, "<$dest" or die "cannot open $dest";
+        open FILE, "<$filename" or die "cannot open $filename";  # not an archive: read the cached file itself ($name is not used here)
    }
+    $current_data_file = $name ? "$url:$name" : $url;  # label for "generated from" headers; NOTE(review): $name is appended even for plain files
    return *FILE;
 }

 ################################################################
 # load a unicode.org file as XML data
-sub load_xml_data_file($$)
+sub load_xml_data_file($@)
 {
-    my ($base, $name) = @_;
-    my $FILE = open_data_file( $base, $name );
+    my ($id, $name) = @_;
+    my $FILE = open_data_file( $id, $name );
    my $xml = XML::LibXML->load_xml( IO => $FILE );
    close FILE;
    return $xml;
@ -2053,7 +2045,7 @@ sub load_data()

    # now build mappings from the decomposition field of the Unicode database

-    my $UNICODE_DATA = open_data_file( $UNIDATA, "UnicodeData.txt" );
+    my $UNICODE_DATA = open_data_file( "ucd", "UnicodeData.txt" );
    while (<$UNICODE_DATA>)
    {
 	# Decode the fields ...
@ -2199,7 +2191,7 @@ sub load_data()

    # load the composition exclusions

-    my $EXCL = open_data_file( $UNIDATA, "CompositionExclusions.txt" );
+    my $EXCL = open_data_file( "ucd", "CompositionExclusions.txt" );
    while (<$EXCL>)
    {
        s/\#.*//;  # remove comments
@ -2217,7 +2209,7 @@ sub load_data()
    # load the IDNA mappings

    @idna_decomp_table = @decomp_compat_table;
-    my $IDNA = open_data_file( $IDNADATA, "IdnaMappingTable.txt" );
+    my $IDNA = open_data_file( "idna", "IdnaMappingTable.txt" );
    while (<$IDNA>)
    {
        s/\#.*//;  # remove comments
@ -2260,7 +2252,7 @@ sub load_data()

    # load the Unihan mappings

-    my $UNIHAN = open_data_file( $UNIHAN, "Unihan_Variants.txt" );
+    my $UNIHAN = open_data_file( "unihan", "Unihan_Variants.txt" );
    while (<$UNIHAN>)
    {
        s/\#.*//;  # remove comments
@ -2406,7 +2398,7 @@ sub dump_eucjp_codepage()
        }
    }

-    my $INPUT = open_data_file( $JISDATA, "JIS0208.TXT" );
+    my $INPUT = open_data_file( "jis0208" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -2421,7 +2413,7 @@ sub dump_eucjp_codepage()
    }
    close $INPUT;

-    $INPUT = open_data_file( $JISDATA, "JIS0212.TXT" );
+    $INPUT = open_data_file( "jis0212" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -2460,7 +2452,7 @@ sub dump_krwansung_codepage(@)
    add_mapping( 0xfe, 0xf8ea );
    add_mapping( 0xff, 0xf8eb );

-    my $INPUT = open_data_file( $KSCDATA, "KSX1001.TXT" );
+    my $INPUT = open_data_file( "ksx1001" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -2608,7 +2600,7 @@ sub dump_indic($)
    my $filename = shift;
    my @indic_table;

-    my $INPUT = open_data_file( $UNIDATA, "IndicSyllabicCategory.txt" );
+    my $INPUT = open_data_file( "ucd", "IndicSyllabicCategory.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -2641,7 +2633,8 @@ sub dump_indic($)
    }
    close $INPUT;

-    $INPUT = open_data_file( $UNIDATA, "IndicPositionalCategory.txt" );
+    my $prev_data_file = $current_data_file;
+    $INPUT = open_data_file( "ucd", "IndicPositionalCategory.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -2671,8 +2664,8 @@ sub dump_indic($)
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Indic Syllabic Category */\n";
-    print OUTPUT "/* generated from $UNIDATA:IndicSyllabicCategory.txt */\n";
-    print OUTPUT "/*       and from $UNIDATA:IndicPositionalCategory.txt */\n";
+    print OUTPUT "/* generated from $prev_data_file */\n";
+    print OUTPUT "/*       and from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

@ -2689,7 +2682,7 @@ sub dump_linebreak($)
    my $filename = shift;
    my @break_table;

-    my $INPUT = open_data_file( $UNIDATA, "LineBreak.txt" );
+    my $INPUT = open_data_file( "ucd", "LineBreak.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -2736,7 +2729,7 @@ sub dump_linebreak($)
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Line Break Properties */\n";
-    print OUTPUT "/* generated from $UNIDATA:LineBreak.txt */\n";
+    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

@ -2924,7 +2917,7 @@ sub dump_scripts($)
    my $script_index;
    my $i;

-    my $INPUT = open_data_file( $UNIDATA, "Scripts.txt" );
+    my $INPUT = open_data_file( "ucd", "Scripts.txt" );
    # Fill the table
    # Unknown script id is always 0, so undefined scripts are automatically treated as such
    while (<$INPUT>)
@ -2963,7 +2956,7 @@ sub dump_scripts($)
    open OUTPUT,">$header.new" or die "Cannot create $header";
    print "Building $header\n";
    print OUTPUT "/* Unicode Script IDs */\n";
-    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
+    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";

    print OUTPUT "enum unicode_script_id {\n";
@ -2981,7 +2974,7 @@ sub dump_scripts($)
    open OUTPUT,">$filename.new" or die "Cannot create $header";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Script IDs */\n";
-    print OUTPUT "/* generated from $UNIDATA:Scripts.txt */\n";
+    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

@ -2997,7 +2990,7 @@ sub dump_mirroring($)
    my $filename = shift;
    my @mirror_table = ();

-    my $INPUT = open_data_file( $UNIDATA, "BidiMirroring.txt" );
+    my $INPUT = open_data_file( "ucd", "BidiMirroring.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -3015,7 +3008,7 @@ sub dump_mirroring($)
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode BiDi mirroring */\n";
-    print OUTPUT "/* generated from $UNIDATA:BidiMirroring.txt */\n";
+    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";
    dump_two_level_mapping( "wine_mirror_map", 0, 16, @mirror_table );
@ -3030,7 +3023,7 @@ sub dump_bracket($)
    my $filename = shift;
    my @bracket_table;

-    my $INPUT = open_data_file( $UNIDATA, "BidiBrackets.txt" );
+    my $INPUT = open_data_file( "ucd", "BidiBrackets.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -3052,7 +3045,7 @@ sub dump_bracket($)
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Bidirectional Bracket table */\n";
-    print OUTPUT "/* generated from $UNIDATA:BidiBrackets.txt */\n";
+    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

@ -3069,7 +3062,7 @@ sub dump_shaping($)
    my $filename = shift;
    my @joining_table = @initial_joining_table;

-    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
+    my $INPUT = open_data_file( "ucd", "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -3088,7 +3081,7 @@ sub dump_shaping($)
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
-    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
+    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

@ -3116,7 +3109,7 @@ sub dump_arabic_shaping($)
    my $filename = shift;
    my @joining_table = @initial_joining_table;

-    my $INPUT = open_data_file( $UNIDATA, "ArabicShaping.txt" );
+    my $INPUT = open_data_file( "ucd", "ArabicShaping.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -3145,7 +3138,7 @@ sub dump_arabic_shaping($)
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Arabic shaping */\n";
-    print OUTPUT "/* generated from $UNIDATA:ArabicShaping.txt */\n";
+    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    print OUTPUT "#include \"windef.h\"\n\n";

@ -3162,7 +3155,7 @@ sub dump_vertical($$)
    my ($filename, $unix) = @_;
    my @vertical_table;

-    my $INPUT = open_data_file( $UNIDATA, "VerticalOrientation.txt" );
+    my $INPUT = open_data_file( "ucd", "VerticalOrientation.txt" );
    while (<$INPUT>)
    {
        next if /^\#/;  # skip comments
@ -3195,7 +3188,7 @@ sub dump_vertical($$)
    open OUTPUT,">$filename.new" or die "Cannot create $filename";
    print "Building $filename\n";
    print OUTPUT "/* Unicode Vertical Orientation */\n";
-    print OUTPUT "/* generated from $UNIDATA:VerticalOrientation.txt */\n";
+    print OUTPUT "/* generated from $current_data_file */\n";
    print OUTPUT "/* DO NOT EDIT!! */\n\n";
    if ($unix)
    {
@ -3688,7 +3681,7 @@ sub dump_msdata_codepage($)
    $default_char = $DEF_CHAR;
    $default_wchar = $DEF_CHAR;

-    my $INPUT = open_data_file( $MSCODEPAGES, $filename ) or die "Cannot open $filename";
+    my $INPUT = open_data_file( "codepages", $filename );

    while (<$INPUT>)
    {
@ -3814,10 +3807,9 @@ sub cmp_compression

 ################################################################
 # build a binary sort keys table
-sub dump_sortkey_table($$)
+sub dump_sortkey_table($)
 {
-    my ($filename, $download) = @_;
-
+    my $filename = shift;
    my @keys;
    my ($part, $section, $subsection, $guid, $version, $ling_flag);
    my @multiple_weights;
@ -3836,7 +3828,7 @@ sub dump_sortkey_table($$)

    my %flags = ( "HAS_3_BYTE_WEIGHTS" => 0x01, "REVERSEDIACRITICS" => 0x10, "DOUBLECOMPRESSION" => 0x20, "INVERSECASING" => 0x40 );

-    my $KEYS = open_data_file( $MSDATA, $download );
+    my $KEYS = open_data_file( "sorting" );

    printf "Building $filename\n";

@ -3997,7 +3989,7 @@ sub dump_sortkey_table($$)
                next;
            }
        }
-        die "$download: $part.$section: unrecognized line $_\n";
+        die "$current_data_file: $part.$section: unrecognized line $_\n";
    }
    close $KEYS;

@ -4390,7 +4382,7 @@ sub convert_time_format($)
 sub load_iso639()
 {
    my %iso639;
-    my $DATA = open_data_file( $ISO639, "iso-639-3_Code_Tables_$ISO639VERSION/iso-639-3.tab" );
+    my $DATA = open_data_file( "iso639", "iso-639-3_Code_Tables_$ISO639VERSION/iso-639-3.tab" );
    while (<$DATA>)
    {
        if (/^\s*[a-z]{3}\s+[a-z]{3}\s+([a-z]{3})\s+([a-z]{2})\s/) { $iso639{$2} = $1; }
@ -4405,11 +4397,11 @@ sub load_iso639()
 sub build_locale_data()
 {
    my $base = "cldr-release-$CLDRVERSION";
-    my $suppl = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/supplementalData.xml" );
-    my $subtags = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/likelySubtags.xml" );
-    my $numbers = load_xml_data_file( $CLDRDATA, "$base/common/supplemental/numberingSystems.xml" );
+    my $suppl = load_xml_data_file( "cldr", "$base/common/supplemental/supplementalData.xml" );
+    my $subtags = load_xml_data_file( "cldr", "$base/common/supplemental/likelySubtags.xml" );
+    my $numbers = load_xml_data_file( "cldr", "$base/common/supplemental/numberingSystems.xml" );
    # obsolete phone data from CLDR version 33
-    my $phone = load_xml_data_file( $CLDR33DATA, "common/supplemental/telephoneCodeData.xml" );
+    my $phone = load_xml_data_file( "cldr33", "common/supplemental/telephoneCodeData.xml" );
    my %iso639 = load_iso639();
    $string_data = pack "S2", 0, 0;  # offset 0 == empty string

@ -4444,7 +4436,7 @@ sub build_locale_data()
        next if defined $loc->{alias};
        (my $file = $loc->{file} || $loc->{name}) =~ s/-/_/g;
        $file = "$base/" . ($loc->{dir} || "common") . "/main/$file.xml";
-        my $xml = load_xml_data_file( $CLDRDATA, $file );
+        my $xml = load_xml_data_file( "cldr", $file );
        $loc->{xml} = $xml;
        $loc->{language} ||= xml_query( $xml, "/ldml/identity/language/\@type" );
        $loc->{territory} ||= xml_query( $xml, "/ldml/identity/territory/\@type" );
@ -5350,7 +5342,7 @@ sub load_windows_zones()
    my $current_name;
    my %names;
    my $base = "cldr-release-$CLDRVERSION";
-    my $INPUT = open_data_file( $CLDRDATA, "$base/common/supplemental/windowsZones.xml" );
+    my $INPUT = open_data_file( "cldr", "$base/common/supplemental/windowsZones.xml" );
    while (<$INPUT>)
    {
        if (/<!-- +(\(UTC.*) -->.*/)
@ -5505,7 +5497,7 @@ sub dump_timezones($@)

    foreach my $filename (@_)
    {
-        my $FILE = open_data_file( $TZDATA, $filename );
+        my $FILE = open_data_file( "tzdata", $filename );
        my $zonename;
        while (<$FILE>)
        {
@ -5804,7 +5796,7 @@ dump_norm_table( "nls/normnfd.nls" );
 dump_norm_table( "nls/normnfkc.nls" );
 dump_norm_table( "nls/normnfkd.nls" );
 dump_norm_table( "nls/normidna.nls" );
-my $chartypes = dump_sortkey_table( "nls/sortdefault.nls", "Windows 10 Sorting Weight Table.txt" );
+my $chartypes = dump_sortkey_table( "nls/sortdefault.nls" );
 dump_locales( "nls/locale.nls", $chartypes );
 foreach my $file (@allfiles) { dump_msdata_codepage( $file ); }
 dump_eucjp_codepage();