make-charset-table.pl 7.77 KB
Newer Older
mjs's avatar
mjs committed
1 2
#!/usr/bin/perl -w

3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
# Copyright (C) 2003, 2004, 2005, 2006 Apple Computer, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1.  Redistributions of source code must retain the above copyright
#     notice, this list of conditions and the following disclaimer. 
# 2.  Redistributions in binary form must reproduce the above copyright
#     notice, this list of conditions and the following disclaimer in the
#     documentation and/or other materials provided with the distribution. 
# 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
#     its contributors may be used to endorse or promote products derived
#     from this software without specific prior written permission. 
#
# THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mjs's avatar
mjs committed
28

29
use strict;
mjs's avatar
mjs committed
30

31 32
my %aliasesFromCharsetsFile;
my %namesWritten;
mjs's avatar
mjs committed
33

34
my $output = "";
mjs's avatar
mjs committed
35

36 37 38
my $error = 0;

sub error ($)
mjs's avatar
mjs committed
39
{
40 41
    print STDERR @_, "\n";
    $error = 1;
mjs's avatar
mjs committed
42 43
}

darin's avatar
darin committed
44 45
sub emit_line
{
mjs's avatar
mjs committed
46
    my ($name, $prefix, $encoding, $flags) = @_;
47 48 49
 
    error "$name shows up twice in output" if $namesWritten{$name};
    $namesWritten{$name} = 1;
darin's avatar
darin committed
50
    
darin's avatar
darin committed
51
    $output .= "        { \"$name\", $prefix$encoding },\n";
darin's avatar
darin committed
52
}
mjs's avatar
mjs committed
53

mjs's avatar
mjs committed
54
sub process_platform_encodings
mjs's avatar
mjs committed
55
{
mjs's avatar
mjs committed
56 57 58
    my ($filename, $PlatformPrefix) = @_;
    my $baseFilename = $filename;
    $baseFilename =~ s|.*/||;
59
    
mjs's avatar
mjs committed
60
    my %seenPlatformNames;
61 62
    my %seenIANANames;
    
mjs's avatar
mjs committed
63
    open PLATFORM_ENCODINGS, $filename or die;
mjs's avatar
mjs committed
64
    
mjs's avatar
mjs committed
65
    while (<PLATFORM_ENCODINGS>) {
66
        chomp;
darin's avatar
darin committed
67 68
        s/\#.*$//;
        s/\s+$//;
darin's avatar
darin committed
69
        if (my ($PlatformName, undef, $flags, $IANANames) = /^(.+?)(, (.+))?: (.+)$/) {
70 71
            my %aliases;
            
mjs's avatar
mjs committed
72
            my $PlatformNameWithFlags = $PlatformName;
darin's avatar
darin committed
73
            if ($flags) {
mjs's avatar
mjs committed
74
                $PlatformNameWithFlags .= ", " . $flags;
darin's avatar
darin committed
75 76 77
            } else {
                $flags = "NoEncodingFlags";
            }
darin's avatar
darin committed
78
            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformNameWithFlags};
mjs's avatar
mjs committed
79
            $seenPlatformNames{$PlatformNameWithFlags} = 1;
80 81 82

            # Build the aliases list.
            # Also check that no two names are part of the same entry in the charsets file.
darin's avatar
darin committed
83
            my @IANANames = split ", ", $IANANames;
darin's avatar
darin committed
84 85
            my $firstName = "";
            my $canonicalFirstName = "";
86
            my $prevName = "";
87
            for my $name (@IANANames) {
darin's avatar
darin committed
88
                if ($firstName eq "") {
89
                    if ($name !~ /^[-A-Za-z0-9_]+$/) {
mjs's avatar
mjs committed
90
                        error "$name, in $baseFilename, has illegal characters in it";
darin's avatar
darin committed
91 92 93 94 95
                        next;
                    }
                    $firstName = $name;
                } else {
                    if ($name !~ /^[a-z0-9]+$/) {
mjs's avatar
mjs committed
96
                        error "$name, in $baseFilename, has illegal characters in it (must be all lowercase alphanumeric)";
darin's avatar
darin committed
97 98 99
                        next;
                    }
                    if ($name le $prevName) {
mjs's avatar
mjs committed
100
                        error "$name comes after $prevName in $baseFilename, but everything must be in alphabetical order";
101 102
                    }
                    $prevName = $name;
103 104
                }
                
darin's avatar
darin committed
105 106
                my $canonicalName = lc $name;
                $canonicalName =~ tr/-_//d;
107
                
darin's avatar
darin committed
108
                $canonicalFirstName = $canonicalName if $canonicalFirstName eq "";
109
                
mjs's avatar
mjs committed
110
                error "$name is mentioned twice in $baseFilename" if $seenIANANames{$canonicalName};
darin's avatar
darin committed
111 112 113 114 115
                $seenIANANames{$canonicalName} = 1;
                
                $aliases{$canonicalName} = 1;
                next if !$aliasesFromCharsetsFile{$canonicalName};
                for my $alias (@{$aliasesFromCharsetsFile{$canonicalName}}) {
116 117 118
                    $aliases{$alias} = 1;
                }
                for my $otherName (@IANANames) {
darin's avatar
darin committed
119
                    next if $canonicalName eq $otherName;
120
                    if ($aliasesFromCharsetsFile{$otherName}
darin's avatar
darin committed
121 122
                        && $aliasesFromCharsetsFile{$canonicalName} eq $aliasesFromCharsetsFile{$otherName}
                        && $canonicalName le $otherName) {
mjs's avatar
mjs committed
123
                        error "$baseFilename lists both $name and $otherName under $PlatformName, but that aliasing is already specified in character-sets.txt";
124 125 126 127 128
                    }
                }
            }
            
            # write out
mjs's avatar
mjs committed
129
            emit_line($firstName, $PlatformPrefix, $PlatformName, $flags);
darin's avatar
darin committed
130
            for my $alias (sort keys %aliases) {
mjs's avatar
mjs committed
131
                emit_line($alias, $PlatformPrefix, $PlatformName, $flags) if $alias ne $canonicalFirstName;
132
            }
darin's avatar
darin committed
133
        } elsif (/^([a-zA-Z0-9_]+)(, (.+))?$/) {
mjs's avatar
mjs committed
134
            my $PlatformName = $1;
135
            
mjs's avatar
mjs committed
136 137
            error "Platform encoding name $PlatformName is mentioned twice in $baseFilename" if $seenPlatformNames{$PlatformName};
            $seenPlatformNames{$PlatformName} = 1;
darin's avatar
darin committed
138
        } elsif (/./) {
darin's avatar
darin committed
139
            error "syntax error in $baseFilename, line $.";
140
        }
mjs's avatar
mjs committed
141
    }
142
    
mjs's avatar
mjs committed
143
    close PLATFORM_ENCODINGS;
mjs's avatar
mjs committed
144 145
}

146 147
sub process_iana_charset 
{
darin's avatar
darin committed
148
    my ($canonical_name, @aliases) = @_;
149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
    
    return if !$canonical_name;
    
    my @names = sort $canonical_name, @aliases;
    
    for my $name (@names) {
        $aliasesFromCharsetsFile{$name} = \@names;
    }
}

sub process_iana_charsets
{
    my ($filename) = @_;
    
    open CHARSETS, $filename or die;
    
    my %seen;
    
    my $canonical_name;
    my @aliases;
    
darin's avatar
darin committed
170 171
    my %exceptions = ( isoir91 => 1, isoir92 => 1 );
    
mjs's avatar
mjs committed
172
    while (<CHARSETS>) {
173
        chomp;
ddkilzer's avatar
ddkilzer committed
174
        if ((my $new_canonical_name) = /Name: ([^ \t]*).*/) {
175
            $new_canonical_name = lc $new_canonical_name;
darin's avatar
darin committed
176
            $new_canonical_name =~ tr/a-z0-9//cd;
177 178
            
            error "saw $new_canonical_name twice in character-sets.txt", if $seen{$new_canonical_name};
darin's avatar
darin committed
179
            $seen{$new_canonical_name} = $new_canonical_name;
180
            
ddkilzer's avatar
ddkilzer committed
181
            process_iana_charset $canonical_name, @aliases;
darin's avatar
darin committed
182 183 184

            $canonical_name = $new_canonical_name;
            @aliases = ();
ddkilzer's avatar
ddkilzer committed
185
        } elsif ((my $new_alias) = /Alias: ([^ \t]*).*/) {
186
            $new_alias = lc $new_alias;
darin's avatar
darin committed
187
            $new_alias =~ tr/a-z0-9//cd;
188
            
ddkilzer's avatar
ddkilzer committed
189 190 191 192
            # do this after normalizing the alias, sometimes character-sets.txt
            # has weird escape characters, e.g. \b after None
            next if $new_alias eq "none";

darin's avatar
darin committed
193 194 195
            error "saw $new_alias twice in character-sets.txt $seen{$new_alias}, $canonical_name", if $seen{$new_alias} && $seen{$new_alias} ne $canonical_name && !$exceptions{$new_alias};
            push @aliases, $new_alias if !$seen{$new_alias};
            $seen{$new_alias} = $canonical_name;            
ddkilzer's avatar
ddkilzer committed
196
        }
mjs's avatar
mjs committed
197
    }
198
    
darin's avatar
darin committed
199
    process_iana_charset $canonical_name, @aliases;
200 201
    
    close CHARSETS;
mjs's avatar
mjs committed
202 203 204 205
}

# Program body

206
process_iana_charsets($ARGV[0]);
mjs's avatar
mjs committed
207
process_platform_encodings($ARGV[1], $ARGV[2]);
mjs's avatar
mjs committed
208

209
exit 1 if $error;
mjs's avatar
mjs committed
210

211 212 213 214 215 216 217 218 219 220
print <<EOF
// File generated by make-charset-table.pl. Do not edit!

#include "config.h"
#include "CharsetData.h"

namespace WebCore {

    const CharsetEntry CharsetTable[] = {
$output
darin's avatar
darin committed
221
        { 0, 0 }
222 223 224 225
    };

}
EOF