|
|
@@ -56,155 +56,6 @@ namespace UnifiedRegex
|
|
|
}
|
|
|
};
|
|
|
|
|
|
-/*
|
|
|
-We first construct a total map from character codes to equivalence lists such that:
|
|
|
- - if ToUpper(c1) == ToUpper(c2) then c1 has c2 in its equivalence list
|
|
|
- - if c1 and c2 appear in the same equivalence list then c1 and c2 have equal equivalence lists
|
|
|
-
|
|
|
-We then compress the above map such that:
|
|
|
- - characters with singleton equivalence classes are elided
|
|
|
- - consecutive characters with consecutive equivalence lists are represented as a range and delta
|
|
|
- - the result is in strictly increasing range order
|
|
|
-
|
|
|
-Using gawk, the above is computed by:
|
|
|
- gawk -f equiv.gawk http://www.unicode.org/Public/8.0.0/ucd/UnicodeData.txt | gawk -f table.gawk
|
|
|
-
|
|
|
-Where equiv.gawk is:
|
|
|
-----------------------------------------------------------------------
|
|
|
-BEGIN {
|
|
|
- FS = ";";
|
|
|
- previncode = -1;
|
|
|
-}
|
|
|
-length($1) == 4 {
|
|
|
- incode = strtonum("0x" $1);
|
|
|
- for (i = previncode + 1; i < incode; i++)
|
|
|
- map[i] = i;
|
|
|
- if ($3 == "Ll" && $15 != "")
|
|
|
- {
|
|
|
- map[incode] = strtonum("0x" $15);
|
|
|
- # non-7-bit-ASCII cannot map to 7-bit-ASCII
|
|
|
- if (incode > 127 && map[incode] <= 127)
|
|
|
- map[incode] = incode;
|
|
|
- }
|
|
|
- else
|
|
|
- map[incode] = incode;
|
|
|
- previncode = incode;
|
|
|
-}
|
|
|
-END {
|
|
|
- for (i = previncode + 1; i <= 0xffff; i++)
|
|
|
- map[i] = i;
|
|
|
-
|
|
|
- for (i = 0x0000; i <= 0xffff; i++)
|
|
|
- ninv[i] = 0;
|
|
|
-
|
|
|
- for (i = 0x0000; i <= 0xffff; i++)
|
|
|
- {
|
|
|
- if (map[i] != i)
|
|
|
- ninv[map[i]]++;
|
|
|
- }
|
|
|
-
|
|
|
- maxninv = 0;
|
|
|
- for (i = 0x0000; i <= 0xffff; i++)
|
|
|
- {
|
|
|
- if (ninv[i] > maxninv)
|
|
|
- maxninv = ninv[i];
|
|
|
- }
|
|
|
- if (maxninv > 2)
|
|
|
- print "ERROR";
|
|
|
-
|
|
|
- for (i = 0x0000; i <= 0xffff; i++)
|
|
|
- inv[i] = "";
|
|
|
-
|
|
|
- for (i = 0x0000; i <= 0xffff; i++)
|
|
|
- {
|
|
|
- if (map[i] != i)
|
|
|
- inv[map[i]] = sprintf("%s;0x%04x", inv[map[i]], i);
|
|
|
- }
|
|
|
-
|
|
|
- for (i = 0x0000; i <= 0xffff; i++)
|
|
|
- {
|
|
|
- if (map[i] != i)
|
|
|
- {
|
|
|
- equiv[i] = sprintf("0x%04x%s", map[i], inv[map[i]]);
|
|
|
- nequiv[i] = 1 + ninv[map[i]];
|
|
|
- }
|
|
|
- else if (inv[i] != "")
|
|
|
- {
|
|
|
- equiv[i] = sprintf("0x%04x%s", i, inv[i]);
|
|
|
- nequiv[i] = 1 + ninv[i];
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
- equiv[i] = sprintf("0x%04x", i);
|
|
|
- nequiv[i] = 1;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- nentries = 0
|
|
|
- for (i = 0x0000; i <= 0xffff; i++)
|
|
|
- {
|
|
|
- if (nequiv[i] > 1)
|
|
|
- {
|
|
|
- printf("0x%04x;%s\n", i, equiv[i]);
|
|
|
- nentries++;
|
|
|
- }
|
|
|
- }
|
|
|
- #printf("nentries = %d\n", nentries);
|
|
|
-}
|
|
|
-----------------------------------------------------------------------
|
|
|
-
|
|
|
-And table.gawk is:
|
|
|
-----------------------------------------------------------------------
|
|
|
-BEGIN {
|
|
|
- FS = ";";
|
|
|
- lastCode = -1;
|
|
|
- currStart = -1;
|
|
|
- for (i = 0; i < 3; i++)
|
|
|
- currDeltas[i] = "";
|
|
|
-}
|
|
|
-{
|
|
|
- if (NF > 4)
|
|
|
- print "ERROR"
|
|
|
-
|
|
|
- incode = strtonum($1);
|
|
|
- for (i = 0; i < NF - 1; i++)
|
|
|
- equivs[i] = strtonum($(i+2));
|
|
|
- for (i = NF - 1; i < 3; i++)
|
|
|
- equivs[i] = equivs[i - 1];
|
|
|
-
|
|
|
- #printf("0x%04x, 0x%04x, 0x%04x, 0x%04x\n", incode, equivs[0], equivs[1], equivs[2]);
|
|
|
-
|
|
|
- for (i = 0; i < 3; i++)
|
|
|
- deltas[i] = equivs[i] - incode;
|
|
|
-
|
|
|
- if (currStart < 0)
|
|
|
- {
|
|
|
- # start a new range
|
|
|
- currStart = incode;
|
|
|
- for (i = 0; i < 3; i++)
|
|
|
- currDeltas[i] = deltas[i]
|
|
|
- }
|
|
|
- else if (incode == lastCode + 1 && deltas[0] == currDeltas[0] && deltas[1] == currDeltas[1] && deltas[2] == currDeltas[2])
|
|
|
- {
|
|
|
- # keep accumulating range
|
|
|
- }
|
|
|
- else
|
|
|
- {
|
|
|
- # dump current range and start a new one
|
|
|
- printf(" 0x%04x, 0x%04x, %d, %d, %d,\n", currStart, lastCode, currDeltas[0], currDeltas[1], currDeltas[2]);
|
|
|
- currStart = incode;
|
|
|
- for (i = 0; i < 3; i++)
|
|
|
- currDeltas[i] = deltas[i]
|
|
|
- }
|
|
|
-
|
|
|
- lastCode = incode;
|
|
|
-}
|
|
|
-END {
|
|
|
- printf(" 0x%04x, 0x%04x, %d, %d, %d,\n", currStart, lastCode, currDeltas[0], currDeltas[1], currDeltas[2]);
|
|
|
-}
|
|
|
-----------------------------------------------------------------------
|
|
|
-*/
|
|
|
-
|
|
|
// The case-folding entries use version 8.0.0 of CaseFolding.txt, located at [1].
|
|
|
// [1] http://www.unicode.org/Public/8.0.0/ucd/CaseFolding.txt
|
|
|
static constexpr Transform transforms[] =
|