libwine: Add support in cpmap.pl for parsing the Windows bestfit codepage files.

2024-11-01 09:50:52 +00:00 · 2009-01-19 19:21:55 +01:00 · 2009-01-19 19:21:55 +01:00 · 97d31ec789
commit 97d31ec789
parent e098f427b4
1 changed files with 153 additions and 47 deletions
--- a/libs/wine/cpmap.pl
+++ b/libs/wine/cpmap.pl
@ -56,24 +56,24 @@ $DEF_CHAR = ord '?';
    [ 865,   "VENDORS/MICSFT/PC/CP865.TXT",       1, "OEM Nordic" ],
    [ 866,   "VENDORS/MICSFT/PC/CP866.TXT",       1, "OEM Russian" ],
    [ 869,   "VENDORS/MICSFT/PC/CP869.TXT",       1, "OEM Greek" ],
-    [ 874,   "VENDORS/MICSFT/PC/CP874.TXT",       1, "ANSI/OEM Thai" ],
+    [ 874,   "VENDORS/MICSFT/WindowsBestFit/bestfit874.txt",  1, "ANSI/OEM Thai" ],
-    [ 875,   "VENDORS/MICSFT/EBCDIC/CP875.TXT",   0, "IBM EBCDIC Greek" ],
+    [ 875,   "VENDORS/MICSFT/EBCDIC/CP875.TXT",               0, "IBM EBCDIC Greek" ],
-    [ 878,   "VENDORS/MISC/KOI8-R.TXT",           0, "Russian KOI8" ],
+    [ 878,   "VENDORS/MISC/KOI8-R.TXT",                       0, "Russian KOI8" ],
-    [ 932,   "VENDORS/MICSFT/WINDOWS/CP932.TXT",  0, "ANSI/OEM Japanese Shift-JIS" ],
+    [ 932,   "VENDORS/MICSFT/WindowsBestFit/bestfit932.txt",  0, "ANSI/OEM Japanese Shift-JIS" ],
-    [ 936,   "VENDORS/MICSFT/WINDOWS/CP936.TXT",  0, "ANSI/OEM Simplified Chinese GBK" ],
+    [ 936,   "VENDORS/MICSFT/WindowsBestFit/bestfit936.txt",  0, "ANSI/OEM Simplified Chinese GBK" ],
-    [ 949,   "VENDORS/MICSFT/WINDOWS/CP949.TXT",  0, "ANSI/OEM Korean Unified Hangul" ],
+    [ 949,   "VENDORS/MICSFT/WindowsBestFit/bestfit949.txt",  0, "ANSI/OEM Korean Unified Hangul" ],
-    [ 950,   "VENDORS/MICSFT/WINDOWS/CP950.TXT",  0, "ANSI/OEM Traditional Chinese Big5" ],
+    [ 950,   "VENDORS/MICSFT/WindowsBestFit/bestfit950.txt",  0, "ANSI/OEM Traditional Chinese Big5" ],
-    [ 1006,  "VENDORS/MISC/CP1006.TXT",           0, "IBM Arabic" ],
+    [ 1006,  "VENDORS/MISC/CP1006.TXT",                       0, "IBM Arabic" ],
-    [ 1026,  "VENDORS/MICSFT/EBCDIC/CP1026.TXT",  0, "IBM EBCDIC Latin 5 Turkish" ],
+    [ 1026,  "VENDORS/MICSFT/EBCDIC/CP1026.TXT",              0, "IBM EBCDIC Latin 5 Turkish" ],
-    [ 1250,  "VENDORS/MICSFT/WINDOWS/CP1250.TXT", 0, "ANSI Eastern Europe" ],
+    [ 1250,  "VENDORS/MICSFT/WindowsBestFit/bestfit1250.txt", 0, "ANSI Eastern Europe" ],
-    [ 1251,  "VENDORS/MICSFT/WINDOWS/CP1251.TXT", 0, "ANSI Cyrillic" ],
+    [ 1251,  "VENDORS/MICSFT/WindowsBestFit/bestfit1251.txt", 0, "ANSI Cyrillic" ],
-    [ 1252,  "VENDORS/MICSFT/WINDOWS/CP1252.TXT", 0, "ANSI Latin 1" ],
+    [ 1252,  "VENDORS/MICSFT/WindowsBestFit/bestfit1252.txt", 0, "ANSI Latin 1" ],
-    [ 1253,  "VENDORS/MICSFT/WINDOWS/CP1253.TXT", 0, "ANSI Greek" ],
+    [ 1253,  "VENDORS/MICSFT/WindowsBestFit/bestfit1253.txt", 0, "ANSI Greek" ],
-    [ 1254,  "VENDORS/MICSFT/WINDOWS/CP1254.TXT", 0, "ANSI Turkish" ],
+    [ 1254,  "VENDORS/MICSFT/WindowsBestFit/bestfit1254.txt", 0, "ANSI Turkish" ],
-    [ 1255,  "VENDORS/MICSFT/WINDOWS/CP1255.TXT", 0, "ANSI Hebrew" ],
+    [ 1255,  "VENDORS/MICSFT/WindowsBestFit/bestfit1255.txt", 0, "ANSI Hebrew" ],
-    [ 1256,  "VENDORS/MICSFT/WINDOWS/CP1256.TXT", 0, "ANSI Arabic" ],
+    [ 1256,  "VENDORS/MICSFT/WindowsBestFit/bestfit1256.txt", 0, "ANSI Arabic" ],
-    [ 1257,  "VENDORS/MICSFT/WINDOWS/CP1257.TXT", 0, "ANSI Baltic" ],
+    [ 1257,  "VENDORS/MICSFT/WindowsBestFit/bestfit1257.txt", 0, "ANSI Baltic" ],
-    [ 1258,  "VENDORS/MICSFT/WINDOWS/CP1258.TXT", 0, "ANSI/OEM Viet Nam" ],
+    [ 1258,  "VENDORS/MICSFT/WindowsBestFit/bestfit1258.txt", 0, "ANSI/OEM Viet Nam" ],
    [ 1361,  "OBSOLETE/EASTASIA/KSC/JOHAB.TXT",   0, "Korean Johab" ],
    [ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT",      0, "Mac Roman" ],
    [ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT",      0, "Mac Greek" ],
@ -724,9 +724,9 @@ sub DUMP_ARRAY
 ################################################################
 # dump an SBCS mapping table
-sub DUMP_SBCS_TABLE
+sub dump_sbcs_table($$$$$)
 {
-    my ($codepage, $has_glyphs, $name) = @_;
+    my ($codepage, $has_glyphs, $name, $def, $defw) = @_;
    my $i;
    # output the ascii->unicode table
@ -734,14 +734,14 @@ sub DUMP_SBCS_TABLE
    if ($has_glyphs)
    {
        printf OUTPUT "static const WCHAR cp2uni[512] =\n";
-        printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
+        printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
        printf OUTPUT ",\n    /* glyphs */\n%s\n};\n\n",
-                      DUMP_ARRAY( "0x%04x", $DEF_CHAR, get_glyphs_mapping(@cp2uni[0 .. 255]) );
+                      DUMP_ARRAY( "0x%04x", $defw, get_glyphs_mapping(@cp2uni[0 .. 255]) );
    }
    else
    {
        printf OUTPUT "static const WCHAR cp2uni[256] =\n";
-        printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
+        printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
    }
    # count the number of unicode->ascii subtables that contain something
@ -763,10 +763,10 @@ sub DUMP_SBCS_TABLE
    {
        next unless $filled[$i];
        printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
-        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $DEF_CHAR, @uni2cp[($i<<8) .. ($i<<8)+255] );
+        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $def, @uni2cp[($i<<8) .. ($i<<8)+255] );
    }
    printf OUTPUT "    /* defaults */\n";
-    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($DEF_CHAR) x 256 );
+    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($def) x 256 );
    # output a table of the offsets of the subtables in the previous array
@ -784,7 +784,7 @@ sub DUMP_SBCS_TABLE
    printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT "    { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
-                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
+                  $codepage, $def, $defw, $name;
    printf OUTPUT "    cp2uni,\n";
    if ($has_glyphs) { printf OUTPUT "    cp2uni + 256,\n"; }
    else { printf OUTPUT "    cp2uni,\n"; }
@ -795,9 +795,9 @@ sub DUMP_SBCS_TABLE
 ################################################################
 # dump a DBCS mapping table
-sub DUMP_DBCS_TABLE
+sub dump_dbcs_table($$$$@)
 {
-    my ($codepage, $name) = @_;
+    my ($codepage, $name, $def, $defw, @lb_ranges) = @_;
    my $i, $x, $y;
    # build a list of lead bytes that are actually used
@ -820,14 +820,14 @@ sub DUMP_DBCS_TABLE
    # output the ascii->unicode table for the single byte chars
    printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * ($#lblist + 2 + $unused);
-    printf OUTPUT "{\n%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );
+    printf OUTPUT "{\n%s,\n", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[0 .. 255] );
    # output the default table for unused lead bytes
    if ($unused)
    {
        printf OUTPUT "    /* unused lead bytes */\n";
-        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
+        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", 0, ($defw) x 256 );
    }
    # output the ascii->unicode table for each DBCS lead byte
@ -836,7 +836,7 @@ sub DUMP_DBCS_TABLE
    {
        my $base = $lblist[$y] << 8;
        printf OUTPUT "    /* lead byte %02x */\n", $lblist[$y];
-        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[$base .. $base+255] );
+        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $defw, @cp2uni[$base .. $base+255] );
        printf OUTPUT ($y < $#lblist) ? ",\n" : "\n};\n\n";
    }
@ -872,10 +872,10 @@ sub DUMP_DBCS_TABLE
    {
        next unless $filled[$y];
        printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $y, $y;
-        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @uni2cp[($y<<8) .. ($y<<8)+255] );
+        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $def, @uni2cp[($y<<8) .. ($y<<8)+255] );
    }
    printf OUTPUT "    /* defaults */\n";
-    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
+    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($def) x 256 );
    # output a table of the offsets of the subtables in the previous array
@ -893,38 +893,38 @@ sub DUMP_DBCS_TABLE
    printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT "    { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
-                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
+                  $codepage, $def, $defw, $name;
    printf OUTPUT "    cp2uni,\n";
    printf OUTPUT "    cp2uni_leadbytes,\n";
    printf OUTPUT "    uni2cp_low,\n";
    printf OUTPUT "    uni2cp_high,\n";
-    DUMP_LB_RANGES();
+    printf OUTPUT "    {\n    %s\n    }\n", DUMP_ARRAY( "0x%02x", 0, @lb_ranges, 0, 0 );
    printf OUTPUT "};\n";
 }
 ################################################################
-# dump the list of defined lead byte ranges
+# get the list of defined lead byte ranges
-sub DUMP_LB_RANGES
+sub get_lb_ranges()
 {
+    # Returns a flat list of (first, last) pairs, one pair per run of
+    # consecutive values found in @lead_bytes. Nothing is written to OUTPUT;
+    # the caller is responsible for formatting the list.
    my @list = ();
    my @ranges = ();
    my $i = 0;
    foreach $i (@lead_bytes) { $list[$i] = 1; }
    my $on = 0;
-    printf OUTPUT "    { ";
    for ($i = 0; $i < 256; $i++)
    {
        if ($on)
        {
-            if (!defined $list[$i]) { printf OUTPUT "0x%02x, ", $i-1; $on = 0; }
+            if (!defined $list[$i]) { push @ranges, $i-1; $on = 0; }
        }
        else
        {
-            if ($list[$i]) { printf OUTPUT "0x%02x, ", $i; $on = 1; }
+            if ($list[$i]) { push @ranges, $i; $on = 1; }
        }
    }
-    if ($on) { printf OUTPUT "0xff, "; }
+    if ($on) { push @ranges, 0xff; }
-    printf OUTPUT "0x00, 0x00 }\n";
+    return @ranges;
 }
@ -1199,6 +1199,110 @@ sub DUMP_COMPOSE_TABLES
 }
 ################################################################
 # handle a "bestfit" Windows mapping file
 sub handle_bestfit_file($$$)
 {
    # Parse one Windows "bestfit" mapping file and generate the matching
    # c_NNN.c source file.
    #
    # Arguments:
    #   $filename   - path of the mapping file, relative to $MAPPREFIX
    #   $has_glyphs - passed through to dump_sbcs_table for single-byte pages
    #   $comment    - description written into the generated file header
    #
    # Side effects: fills the global @cp2uni, @uni2cp and @lead_bytes arrays,
    # writes "c_NNN.c.new" through the global OUTPUT handle and then calls
    # save_file() on the final name.  Dies on an unreadable input file, an
    # unrecognized line, or a file lacking the CODEPAGE/CPINFO header lines.
    my ($filename, $has_glyphs, $comment) = @_;
    my $path = $MAPPREFIX . $filename;
    my $state = "";
    my ($codepage, $width, $def, $defw);
    my ($count, $lb_cur, $lb_end);
    my @lb_ranges = ();
    open INPUT, '<', $path or die "Cannot open $path: $!";
    while (<INPUT>)
    {
        next if /^;/;  # skip comments
        next if /^\s*$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        last if /^ENDCODEPAGE/;
        if (/^CODEPAGE\s+(\d+)/)
        {
            $codepage = $1;
            next;
        }
        # CPINFO <width> 0x<default char> 0x<default unicode char>
        if (/^CPINFO\s+(\d+)\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            $width = $1;
            $def = hex $2;
            $defw = hex $3;
            next;
        }
        # section header: remember which table follows and its entry count
        if (/^(MBTABLE|WCTABLE|DBCSRANGE|DBCSTABLE)\s+(\d+)/)
        {
            $state = $1;
            $count = $2;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)/)
        {
            my ($first, $second) = (hex $1, hex $2);
            if ($state eq "MBTABLE")
            {
                # codepage -> unicode; the first mapping seen wins
                $cp2uni[$first] = $second unless defined($cp2uni[$first]);
                next;
            }
            if ($state eq "WCTABLE")
            {
                # unicode -> codepage; the first mapping seen wins
                $uni2cp[$first] = $second unless defined($uni2cp[$first]);
                next;
            }
            if ($state eq "DBCSRANGE")
            {
                # a range of lead bytes: record it and mark every byte in it
                push @lb_ranges, $first, $second;
                for (my $i = $first; $i <= $second; $i++)
                {
                    push @lead_bytes, $i;
                    $cp2uni[$i] = 0;
                }
                $lb_cur = $first;
                $lb_end = $second;
                next;
            }
            if ($state eq "DBCSTABLE")
            {
                # trail byte -> unicode for the current lead byte
                my $cp = ($lb_cur << 8) | $first;
                $cp2uni[$cp] = $second unless defined($cp2uni[$cp]);
                # once this table's entries are consumed, advance to the next
                # lead byte; past the end of the range, expect a new range
                if (!--$count)
                {
                    if (++$lb_cur > $lb_end) { $state = "DBCSRANGE"; }
                }
                next;
            }
        }
        die "$filename: Unrecognized line $_\n";
    }
    close INPUT;
    # without these the output name and table layout would be meaningless
    die "$filename: missing CODEPAGE or CPINFO line\n"
        unless defined($codepage) && defined($width);
    my $output = sprintf "c_%03d.c", $codepage;
    open OUTPUT, '>', "$output.new" or die "Cannot create $output: $!";
    printf "Building %s from %s (%s)\n", $output, $filename, $comment;
    # dump all tables
    printf OUTPUT "/* code page %03d (%s) */\n", $codepage, $comment;
    printf OUTPUT "/* generated from %s */\n", $path;
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";
    if ($width == 1) { dump_sbcs_table( $codepage, $has_glyphs, $comment, $def, $defw ); }
    else { dump_dbcs_table( $codepage, $comment, $def, $defw, @lb_ranges ); }
    close OUTPUT;
    save_file($output);
 }
 ################################################################
 # read an input file and generate the corresponding .c file
 sub HANDLE_FILE
@ -1212,11 +1316,13 @@ sub HANDLE_FILE
    # symbol codepage file is special
    if ($codepage == 20932) { READ_JIS0208_FILE($MAPPREFIX . $filename); }
    elsif ($codepage == 20127) { fill_20127_codepage(); }
    elsif ($filename =~ /\/bestfit/)
    {
        handle_bestfit_file( $filename, $has_glyphs, $comment );
        return;
    }
    else { READ_FILE($MAPPREFIX . $filename); }
    # hack: 0x00a5 must map to backslash in Shift-JIS
    if ($codepage == 932) { $uni2cp[0x00a5] = 0x5c; }
    ADD_DEFAULT_MAPPINGS();
    my $output = sprintf "c_%03d.c", $codepage;
@ -1238,8 +1344,8 @@ sub HANDLE_FILE
    }
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";
-    if ($#lead_bytes == -1) { DUMP_SBCS_TABLE( $codepage, $has_glyphs, $comment ); }
+    if (!@lead_bytes) { dump_sbcs_table( $codepage, $has_glyphs, $comment, $DEF_CHAR, $DEF_CHAR ); }
-    else { DUMP_DBCS_TABLE( $codepage, $comment ); }
+    else { dump_dbcs_table( $codepage, $comment, $DEF_CHAR, $DEF_CHAR, get_lb_ranges() ); }
    close OUTPUT;
    save_file($output);
 }