//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------
#include "ParserPch.h"

namespace UnifiedRegex
{

    // ----------------------------------------------------------------------
    // ASCIIChars
    // ----------------------------------------------------------------------

/*
To get these two tables run:
  ch.exe ascii.js
where ascii.js is:
----------------------------------------------------------------------
function echo(s) { WScript.Echo(s); }

var NumChars = 1 << 8;

var Word = 1 << 0;
var Newline = 1 << 1;
var Whitespace = 1 << 2;
var Letter     = 1 << 3;
var Digit      = 1 << 4;
var Octal      = 1 << 5;
var Hex        = 1 << 6;

var classes = [];
var values = [];

function cc(s) {
    return s.charCodeAt(0);
}

var c;
for (c = 0; c < NumChars; c++)
{
    classes[c] = 0;
    values[c] = 0;
}
for (c = cc('0'); c <= cc('7'); c++)
{
    classes[c] |= Word | Octal | Digit | Hex;
    values[c] = c - cc('0');
}
for (c = cc('8'); c <= cc('9'); c++)
{
    classes[c] |= Word | Digit | Hex;
    values[c] = c - cc('0');
}
for (c = cc('a'); c <= cc('f'); c++)
{
    classes[c] |= Word | Hex | Letter;
    values[c] = 10 + c - cc('a');
}
for (c = cc('g'); c <= cc('z'); c++)
    classes[c] |= Word | Letter;
for (c = cc('A'); c <= cc('F'); c++)
{
    classes[c] |= Word | Hex | Letter;
    values[c] = 10 + c - cc('A');
}
for (c = cc('G'); c <= cc('Z'); c++)
    classes[c] |= Word | Letter;
classes[cc('_')] |= Word;

classes[cc('\n')] |= Newline;
classes[cc('\r')] |= Newline;

for (c = cc('\t'); c <= cc('\r'); c++)
    classes[c] |= Whitespace;
classes[cc(' ')] |= Whitespace;
classes[cc('\x85')] |= Whitespace;
classes[cc('\xa0')] |= Whitespace;

hex = "0123456789abcdef";
function toHex(n) {
    return "0x" + hex[n >> 4] + hex[n & 0xf];
}

function dump(a) {
    for (c = 0; c < NumChars; c++) {
        if (c % 16 == 0)
            str = "        ";
        else
            str += ", ";
        str += toHex(a[c]);
        if (c % 16 == 15)
        {
            if (c < NumChars - 1)
                str += ",";
            echo(str);
        }
    }
}

echo("    const uint8 ASCIIChars::classes[] = {");
dump(classes);
echo("    };");
echo("    const uint8 ASCIIChars::values[] = {");
dump(values);
echo("    };");
----------------------------------------------------------------------
*/

    // Per-character class bitmask, indexed by character code 0x00-0xff (16 entries per row).
    // Generated by the ascii.js script quoted above; regenerate rather than hand-editing.
    // Bit meanings, as assigned in that script:
    //   0x01 Word, 0x02 Newline, 0x04 Whitespace, 0x08 Letter,
    //   0x10 Digit, 0x20 Octal, 0x40 Hex
    // e.g. '0'-'7' are 0x71 (Word | Digit | Octal | Hex) and 'a'-'f' are 0x49 (Word | Letter | Hex).
    // NOTE(review): 0x85 carries the Whitespace bit in this table but is not covered by the
    // ranges in StandardChars<char16>::whitespaceStr - confirm the difference is intended.
    const uint8 ASCIIChars::classes[] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x06, 0x04, 0x04, 0x06, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x71, 0x51, 0x51, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
        0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x00, 0x00, 0x00, 0x00, 0x01,
        0x00, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
        0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
    };
    // Per-character numeric value, indexed by character code 0x00-0xff (16 entries per row).
    // Generated by the ascii.js script quoted above: '0'-'9' map to 0-9, 'a'-'f' and 'A'-'F'
    // map to 10-15, and every other character maps to 0. Because '0' also maps to 0, a zero
    // entry alone does not tell whether the character is a digit.
    const uint8 ASCIIChars::values[] = {
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
    };

    // ----------------------------------------------------------------------
    // TrivialCaseMapper
    // ----------------------------------------------------------------------

    // Out-of-class definition of the static TrivialCaseMapper instance. Its address is
    // passed to the unicodeDataCaseMapper member in the StandardChars<char16> constructor.
    const TrivialCaseMapper TrivialCaseMapper::Instance;

    // ----------------------------------------------------------------------
    // StandardChars<char16>
    // ----------------------------------------------------------------------

/*
To regenerate the whitespace ranges string (whitespaceStr), run:
  gawk -f spaces.gawk http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
where spaces.gawk is
----------------------------------------------------------------------
BEGIN {
  FS = ";";
  start = -1;
  last = -1;
  str = "";
}
{
  code = strtonum("0x" $1);
  if ($3 == "Zs" || code == 0x0009 || code == 0x000B || code == 0x000C || code == 0x0020 || code == 0x00A0 || code == 0xFEFF || code == 0x000A || code == 0x000D || code == 0x2028 || code == 0x2029)
  {
    if (start < 0)
      start = code;
    else if (code > last + 1) {
      str = sprintf("%s\\x%04x\\x%04x", str, start, last);
      start = code;
    }
    last = code;
  }
}
END {
  str = sprintf("%s\\x%04x\\x%04x", str, start, last);
  print str;
}
----------------------------------------------------------------------
*/

    // Range tables for the standard character classes. Each string is a flat sequence of
    // (first, last) character pairs, and the matching num*Pairs constant is the number of
    // pairs in that string; keep the two in sync when a table is regenerated.

    // Digits: '0'-'9'.
    const int StandardChars<char16>::numDigitPairs = 1;
    const char16* const StandardChars<char16>::digitStr = _u("09");
    // Whitespace: U+0009-U+000D, U+0020, U+00A0, U+1680, U+180E, U+2000-U+200A,
    // U+2028-U+2029, U+202F, U+205F, U+3000, U+FEFF (output format of spaces.gawk above).
    const int StandardChars<char16>::numWhitespacePairs = 11;
    const char16* const StandardChars<char16>::whitespaceStr = _u("\x0009\x000d\x0020\x0020\x00a0\x00a0\x1680\x1680\x180e\x180e\x2000\x200a\x2028\x2029\x202f\x202f\x205f\x205f\x3000\x3000\xfeff\xfeff");
    // Word characters: '0'-'9', 'A'-'Z', '_', 'a'-'z'.
    const int StandardChars<char16>::numWordPairs = 4;
    const char16* const StandardChars<char16>::wordStr = _u("09AZ__az");
    // Newlines: U+000A, U+000D, U+2028-U+2029.
    const int StandardChars<char16>::numNewlinePairs = 3;
    const char16* const StandardChars<char16>::newlineStr = _u("\x000a\x000a\x000d\x000d\x2028\x2029");

    // Constructs the char16 standard-character helper.
    //
    // allocator: arena stored on the object; it is handed to both case mappers here and is
    //            later used by the Get*Set() accessors to allocate their cached sets.
    //
    // unicodeDataCaseMapper is built on top of TrivialCaseMapper::Instance, and
    // caseFoldingCaseMapper is built on top of unicodeDataCaseMapper.
    //
    // Every lazily created set pointer starts out as 0 because each accessor tests its
    // pointer against 0 to decide whether the set still has to be created.
    StandardChars<char16>::StandardChars(ArenaAllocator* allocator)
        : allocator(allocator)
        , unicodeDataCaseMapper(allocator, CaseInsensitive::MappingSource::UnicodeData, &TrivialCaseMapper::Instance)
        , caseFoldingCaseMapper(allocator, CaseInsensitive::MappingSource::CaseFolding, &unicodeDataCaseMapper)
        , fullSet(0)
        , emptySet(0)
        , wordSet(0)
        , nonWordSet(0)
        , newlineSet(0)
        , whitespaceSet(0)
        // Previously left uninitialized: GetSurrogateUpperRange() compares this pointer
        // with 0, so an indeterminate value could be returned to callers as if it were a set.
        , surrogateUpperRange(0)
    {
    }

    // Hands the digit range table (numDigitPairs pairs from digitStr, i.e. '0'-'9') to
    // set.SetRanges, using setAllocator for the call.
    // setAllocator: allocator forwarded to the set operation.
    // set:          character set modified in place.
    void StandardChars<char16>::SetDigits(ArenaAllocator* setAllocator, CharSet<Char> &set)
    {
        set.SetRanges(setAllocator, numDigitPairs, digitStr);
    }

    // Hands the digit range table (numDigitPairs pairs from digitStr) to set.SetNotRanges,
    // using setAllocator for the call.
    // NOTE(review): presumably SetNotRanges stores the complement of the given ranges, so
    // `set` ends up holding every non-digit - confirm against CharSet::SetNotRanges.
    void StandardChars<char16>::SetNonDigits(ArenaAllocator* setAllocator, CharSet<Char> &set)
    {
        set.SetNotRanges(setAllocator, numDigitPairs, digitStr);
    }

    // Hands the whitespace range table (numWhitespacePairs pairs from whitespaceStr) to
    // set.SetRanges, using setAllocator for the call.
    // setAllocator: allocator forwarded to the set operation.
    // set:          character set modified in place.
    void StandardChars<char16>::SetWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set)
    {
        set.SetRanges(setAllocator, numWhitespacePairs, whitespaceStr);
    }

    // Hands the whitespace range table (numWhitespacePairs pairs from whitespaceStr) to
    // set.SetNotRanges, using setAllocator for the call.
    // NOTE(review): presumably SetNotRanges stores the complement of the given ranges, so
    // `set` ends up holding every non-whitespace character - confirm against CharSet.
    void StandardChars<char16>::SetNonWhitespace(ArenaAllocator* setAllocator, CharSet<Char> &set)
    {
        set.SetNotRanges(setAllocator, numWhitespacePairs, whitespaceStr);
    }

    // Hands the word-character range table (numWordPairs pairs from wordStr, i.e. '0'-'9',
    // 'A'-'Z', '_', 'a'-'z') to set.SetRanges, using setAllocator for the call.
    // setAllocator: allocator forwarded to the set operation.
    // set:          character set modified in place.
    void StandardChars<char16>::SetWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
    {
        set.SetRanges(setAllocator, numWordPairs, wordStr);
    }

    // Hands the word-character range table (numWordPairs pairs from wordStr) to
    // set.SetNotRanges, using setAllocator for the call.
    // NOTE(review): presumably SetNotRanges stores the complement of the given ranges, so
    // `set` ends up holding every non-word character - confirm against CharSet.
    void StandardChars<char16>::SetNonWordChars(ArenaAllocator* setAllocator, CharSet<Char> &set)
    {
        set.SetNotRanges(setAllocator, numWordPairs, wordStr);
    }

    // Hands the newline range table (numNewlinePairs pairs from newlineStr, i.e. U+000A,
    // U+000D, U+2028-U+2029) to set.SetRanges, using setAllocator for the call.
    // setAllocator: allocator forwarded to the set operation.
    // set:          character set modified in place.
    void StandardChars<char16>::SetNewline(ArenaAllocator* setAllocator, CharSet<Char> &set)
    {
        set.SetRanges(setAllocator, numNewlinePairs, newlineStr);
    }

    // Hands the newline range table (numNewlinePairs pairs from newlineStr) to
    // set.SetNotRanges, using setAllocator for the call.
    // NOTE(review): presumably SetNotRanges stores the complement of the given ranges, so
    // `set` ends up holding every non-newline character - confirm against CharSet.
    void StandardChars<char16>::SetNonNewline(ArenaAllocator* setAllocator, CharSet<Char> &set)
    {
        set.SetNotRanges(setAllocator, numNewlinePairs, newlineStr);
    }

    // Returns the cached set spanning MinChar through MaxChar. The set is created from
    // this object's allocator on the first call and the same pointer is returned afterwards.
    CharSet<char16>* StandardChars<char16>::GetFullSet()
    {
        if (fullSet != nullptr)
        {
            // Already built by an earlier call.
            return fullSet;
        }

        fullSet = Anew(allocator, UnicodeCharSet);
        fullSet->SetRange(allocator, MinChar, MaxChar);
        return fullSet;
    }

    // Returns the cached empty set. The set is created from this object's allocator on
    // the first call and the same pointer is returned afterwards.
    CharSet<char16>* StandardChars<char16>::GetEmptySet()
    {
        if (emptySet != nullptr)
        {
            // Already built by an earlier call.
            return emptySet;
        }

        // Nothing is added after construction: the new set is intentionally left empty.
        emptySet = Anew(allocator, UnicodeCharSet);
        return emptySet;
    }

    // Returns the cached set built from the word-character range table (wordStr). The set
    // is created from this object's allocator on the first call and reused afterwards.
    CharSet<char16>* StandardChars<char16>::GetWordSet()
    {
        if (wordSet != nullptr)
        {
            // Already built by an earlier call.
            return wordSet;
        }

        wordSet = Anew(allocator, UnicodeCharSet);
        wordSet->SetRanges(allocator, numWordPairs, wordStr);
        return wordSet;
    }

    // Returns the cached set built by passing the word-character range table (wordStr) to
    // SetNotRanges. The set is created from this object's allocator on the first call and
    // reused afterwards.
    CharSet<char16>* StandardChars<char16>::GetNonWordSet()
    {
        if (nonWordSet != nullptr)
        {
            // Already built by an earlier call.
            return nonWordSet;
        }

        nonWordSet = Anew(allocator, UnicodeCharSet);
        nonWordSet->SetNotRanges(allocator, numWordPairs, wordStr);
        return nonWordSet;
    }

    // Returns the cached set built from the newline range table (newlineStr). The set is
    // created from this object's allocator on the first call and reused afterwards.
    CharSet<char16>* StandardChars<char16>::GetNewlineSet()
    {
        if (newlineSet != nullptr)
        {
            // Already built by an earlier call.
            return newlineSet;
        }

        newlineSet = Anew(allocator, UnicodeCharSet);
        newlineSet->SetRanges(allocator, numNewlinePairs, newlineStr);
        return newlineSet;
    }

    // Returns the cached set built from the whitespace range table (whitespaceStr). The set
    // is created from this object's allocator on the first call and reused afterwards.
    CharSet<char16>* StandardChars<char16>::GetWhitespaceSet()
    {
        if (whitespaceSet != nullptr)
        {
            // Already built by an earlier call.
            return whitespaceSet;
        }

        whitespaceSet = Anew(allocator, UnicodeCharSet);
        whitespaceSet->SetRanges(allocator, numWhitespacePairs, whitespaceStr);
        return whitespaceSet;
    }
    // Returns the cached set spanning the code units 0xDC00 through 0xDFFF. The set is
    // created from this object's allocator on the first call and reused afterwards.
    CharSet<char16>* StandardChars<char16>::GetSurrogateUpperRange()
    {
        if (surrogateUpperRange != nullptr)
        {
            // Already built by an earlier call.
            return surrogateUpperRange;
        }

        surrogateUpperRange = Anew(allocator, UnicodeCharSet);
        surrogateUpperRange->SetRange(allocator, (char16)0xDC00u, (char16)0xDFFFu);
        return surrogateUpperRange;
    }
}