Add support in the lexer for UTF-8 identifiers. No NFKC normalization logic in char yet.

2024-11-05 20:45:15 +00:00 · 2011-12-29 14:45:18 -08:00 · 2011-12-29 14:45:18 -08:00 · 36c55b20a8
commit 36c55b20a8
parent 5fd0a3be0c
4 changed files with 48 additions and 5 deletions
--- a/doc/rust.texi
+++ b/doc/rust.texi
@ -595,9 +595,10 @@ otherwise defined as keywords or reserved
 tokens. @xref{Ref.Lex.Key}. @xref{Ref.Lex.Res}.

 That is: an identifier starts with any character having derived property
-@code{XID_Start} and continues with zero or more characters having derived
-property @code{XID_Continue}; and such an identifier is NFKC-normalized during
-lexing, such that all subsequent comparison of identifiers is performed on the
+@code{XID_Start}, or the character U+005F (underscore, @code{_}), and
+continues with zero or more characters having derived property
+@code{XID_Continue}. An identifier is NFKC-normalized during lexing, such
+that all subsequent comparison of identifiers is performed on the
 NFKC-normalized forms.

@emph{TODO: define relationship between Unicode and Rust versions}.
--- a/src/comp/syntax/parse/lexer.rs
+++ b/src/comp/syntax/parse/lexer.rs
@ -309,14 +309,16 @@ fn next_token(rdr: reader) -> {tok: token::token, chpos: uint, bpos: uint} {
 fn next_token_inner(rdr: reader) -> token::token {
    let accum_str = "";
    let c = rdr.curr();
-    if is_alpha(c) || c == '_' {
-        while is_alnum(c) || c == '_' {
+    if char::is_XID_start(c) || c == '_' {
+        while char::is_XID_continue(c) {
            str::push_char(accum_str, c);
            rdr.bump();
            c = rdr.curr();
        }
        if str::eq(accum_str, "_") { ret token::UNDERSCORE; }
        let is_mod_name = c == ':' && rdr.next() == ':';
+
+        // FIXME: perform NFKC normalization here.
        ret token::IDENT(interner::intern::<str>(*rdr.get_interner(),
                                                 accum_str), is_mod_name);
    }
--- a/src/libcore/char.rs
+++ b/src/libcore/char.rs
@ -37,6 +37,12 @@
    Cn  Unassigned  a reserved unassigned code point or a noncharacter
 */

+export is_alphabetic,
+       is_XID_start, is_XID_continue,
+       is_lowercase, is_uppercase,
+       is_whitespace, is_alphanumeric,
+       to_digit, maybe_digit, cmp;
+
 import is_alphabetic = unicode::derived_property::Alphabetic;
 import is_XID_start = unicode::derived_property::XID_Start;
 import is_XID_continue = unicode::derived_property::XID_Continue;
--- a/src/test/run-pass/utf8_idents.rs
+++ b/src/test/run-pass/utf8_idents.rs
@ -0,0 +1,34 @@
+fn main() { // Run-pass test: non-ASCII (UTF-8) identifiers must be usable as ordinary names.
+    let Π = 3.14;
+    let लंच = Π * Π + 1.54;
+    assert लंच - 1.54 == Π * Π; // NOTE(review): exact float equality after add/subtract; confirm it holds
+    assert საჭმელად_გემრიელი_სადილი() == 0; // helper returns (10 + 10) >> 10, which is 0
+}
+
+fn საჭმელად_გემრიელი_სადილი() -> int { // Binds 10 to names in many scripts, checks them, returns 0.
+
+    // "Lunch" in several languages: every binding below is a non-ASCII identifier set to 10.
+
+    let ランチ = 10;
+    let 午餐 = 10;
+
+    let ארוחת_צהריי = 10;
+    let غداء = 10;
+    let լանչ = 10;
+    let обед = 10;
+    let абед = 10;
+    let μεσημεριανό = 10;
+    let hádegismatur = 10;
+    let ручек = 10;
+
+    let ăn_trưa = 10;
+    let อาหารกลางวัน = 10;
+
+    // Arithmetic over the bindings above; the final shift (10 + 10) >> 10 evaluates to 0.
+
+    assert hádegismatur * ручек * обед == 1000;
+    assert 10 ==  ארוחת_צהריי;
+    assert ランチ + 午餐 + μεσημεριανό == 30;
+    assert ăn_trưa + อาหารกลางวัน == 20;
+    ret (абед + լանչ) >> غداء;
+}