Scan.cpp 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "ParserPch.h"
  6. /*****************************************************************************
  7. *
  8. * The following table speeds various tests of characters, such as whether
  9. * a given character can be part of an identifier, and so on.
  10. */
  11. int CountNewlines(LPCOLESTR psz, int cch)
  12. {
  13. int cln = 0;
  14. while (0 != *psz && 0 != cch--)
  15. {
  16. switch (*psz++)
  17. {
  18. case OLESTR('\xD'):
  19. if (*psz == OLESTR('\xA'))
  20. {
  21. ++psz;
  22. if (0 == cch--)
  23. break;
  24. }
  25. // fall-through
  26. case OLESTR('\xA'):
  27. cln++;
  28. break;
  29. }
  30. }
  31. return cln;
  32. }
  // Primary template of the narrow/wide literal selector. It has no members;
  // the explicit specializations for the supported character types provide
  // Choose() and Test().
  template <typename CharT>
  struct AorW {};
  37. // Specialization for UTF8Char
  38. template<>
  39. struct AorW< UTF8Char >
  40. {
  41. // Expressing the args as "arrays of size N" ensures that the both args
  42. // are the same length. If not, we get a compile time error.
  43. template< size_t N >
  44. static const UTF8Char* Choose( const char (&a)[N], const char16 (&w)[N] )
  45. {
  46. // The reinterpret_cast is necessary to go from signed to unsigned char
  47. return reinterpret_cast< const UTF8Char* >(a);
  48. }
  49. template< size_t N >
  50. static const bool Test(const char (&a)[N], const char16 (&w)[N], LPCUTF8 value)
  51. {
  52. return 0 == memcmp(a, value, (N - 1) * sizeof(utf8char_t));
  53. }
  54. template< size_t N >
  55. static const bool Test(const char (&a)[N], const char16 (&w)[N], LPCUTF8 start, LPCUTF8 end)
  56. {
  57. return (end - start == N - 1) && (0 == memcmp(a, start, (N - 1) * sizeof(utf8char_t)));
  58. }
  59. };
  60. // Specialization for OLECHAR
  61. template<>
  62. struct AorW< OLECHAR >
  63. {
  64. template< size_t N >
  65. static const char16* Choose( const char (&a)[N], const char16 (&w)[N] )
  66. {
  67. return w;
  68. }
  69. template < size_t N >
  70. static bool Test(const char (&a)[N], const char16 (&w)[N], const char16 *value)
  71. {
  72. return 0 == memcmp(w, value, (N - 1) * sizeof(char16));
  73. }
  74. template < size_t N >
  75. static bool Test(const char (&a)[N], const char16 (&w)[N], const char16 *start, const char16 *end)
  76. {
  77. return (end - start == N - 1) && (0 == memcmp(w, start, (N - 1) * sizeof(char16)));
  78. }
  79. };
  80. BOOL Token::IsKeyword() const
  81. {
  82. // keywords (but not future reserved words)
  83. return (tk <= tkYIELD);
  84. }
  85. tokens Token::SetRegex(UnifiedRegex::RegexPattern *const pattern, Parser *const parser)
  86. {
  87. Assert(parser);
  88. if(pattern)
  89. parser->RegisterRegexPattern(pattern);
  90. this->u.pattern = pattern;
  91. return tk = tkRegExp;
  92. }
  93. IdentPtr Token::CreateIdentifier(HashTbl * hashTbl)
  94. {
  95. Assert(this->u.pid == nullptr);
  96. if (this->u.pchMin)
  97. {
  98. Assert(IsIdentifier());
  99. IdentPtr pid = hashTbl->PidHashNameLen(this->u.pchMin, this->u.length);
  100. this->u.pid = pid;
  101. return pid;
  102. }
  103. Assert(IsReservedWord());
  104. IdentPtr pid = hashTbl->PidFromTk(tk);
  105. this->u.pid = pid;
  106. return pid;
  107. }
  108. template <typename EncodingPolicy>
  109. Scanner<EncodingPolicy>::Scanner(Parser* parser, HashTbl *phtbl, Token *ptoken, ErrHandler *perr, Js::ScriptContext* scriptContext)
  110. {
  111. AssertMem(phtbl);
  112. AssertMem(ptoken);
  113. AssertMem(perr);
  114. m_parser = parser;
  115. m_phtbl = phtbl;
  116. m_ptoken = ptoken;
  117. m_cMinLineMultiUnits = 0;
  118. m_perr = perr;
  119. m_fHadEol = FALSE;
  120. m_doubleQuoteOnLastTkStrCon = FALSE;
  121. m_OctOrLeadingZeroOnLastTKNumber = false;
  122. m_fStringTemplateDepth = 0;
  123. m_scanState = ScanStateNormal;
  124. m_scriptContext = scriptContext;
  125. m_line = 0;
  126. m_startLine = 0;
  127. m_pchStartLine = NULL;
  128. m_ichMinError = 0;
  129. m_ichLimError = 0;
  130. m_tempChBuf.m_pscanner = this;
  131. m_tempChBufSecondary.m_pscanner = this;
  132. m_iecpLimTokPrevious = (size_t)-1;
  133. this->charClassifier = scriptContext->GetCharClassifier();
  134. this->es6UnicodeMode = scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled();
  135. m_fYieldIsKeyword = false;
  136. m_fAwaitIsKeyword = false;
  137. }
  138. template <typename EncodingPolicy>
  139. Scanner<EncodingPolicy>::~Scanner(void)
  140. {
  141. }
  142. /*****************************************************************************
  143. *
  144. * Initializes the scanner to prepare to scan the given source text.
  145. */
  146. template <typename EncodingPolicy>
  147. void Scanner<EncodingPolicy>::SetText(EncodedCharPtr pszSrc, size_t offset, size_t length, charcount_t charOffset, ULONG grfscr, ULONG lineNumber)
  148. {
  149. // Save the start of the script and add the offset to get the point where we should start scanning.
  150. m_pchBase = pszSrc;
  151. m_pchLast = m_pchBase + offset + length;
  152. m_pchPrevLine = m_currentCharacter = m_pchMinLine = m_pchMinTok = pszSrc + offset;
  153. RestoreMultiUnits(offset - charOffset);
  154. // Absorb any byte order mark at the start
  155. if(offset == 0)
  156. {
  157. switch( PeekFull(m_currentCharacter, m_pchLast) )
  158. {
  159. case 0xFFEE: // "Opposite" endian BOM
  160. // We do not support big-endian encodings
  161. // fall-through
  162. case 0xFEFF: // "Correct" BOM
  163. ReadFull<true>(m_currentCharacter, m_pchLast);
  164. break;
  165. }
  166. }
  167. m_line = lineNumber;
  168. m_startLine = lineNumber;
  169. m_pchStartLine = m_currentCharacter;
  170. m_ptoken->tk = tkNone;
  171. m_fIsModuleCode = (grfscr & fscrIsModuleCode) != 0;
  172. m_fHadEol = FALSE;
  173. m_fSyntaxColor = (grfscr & fscrSyntaxColor) != 0;
  174. m_DeferredParseFlags = ScanFlagNone;
  175. }
  176. template <typename EncodingPolicy>
  177. void Scanner<EncodingPolicy>::PrepareForBackgroundParse(Js::ScriptContext *scriptContext)
  178. {
  179. scriptContext->GetThreadContext()->GetStandardChars((EncodedChar*)0);
  180. scriptContext->GetThreadContext()->GetStandardChars((char16*)0);
  181. }
  182. //-----------------------------------------------------------------------------
  183. // Number of code points from 'first' up to, but not including the next
  184. // newline character, embedded NUL, or 'last', depending on which comes first.
  185. //
  186. // This is used to determine a length of BSTR, which can't contain a NUL character.
  187. //-----------------------------------------------------------------------------
  188. template <typename EncodingPolicy>
  189. charcount_t Scanner<EncodingPolicy>::LineLength(EncodedCharPtr first, EncodedCharPtr last)
  190. {
  191. charcount_t result = 0;
  192. EncodedCharPtr p = first;
  193. for (;;)
  194. {
  195. switch( ReadFull<false>(p, last) )
  196. {
  197. case kchNWL: // _C_NWL
  198. case kchRET:
  199. case kchLS:
  200. case kchPS:
  201. case kchNUL: // _C_NUL
  202. return result;
  203. }
  204. result++;
  205. }
  206. }
  207. template <typename EncodingPolicy>
  208. charcount_t Scanner<EncodingPolicy>::UpdateLine(long &line, EncodedCharPtr start, EncodedCharPtr last, charcount_t ichStart, charcount_t ichEnd)
  209. {
  210. EncodedCharPtr p = start;
  211. charcount_t ich = ichStart;
  212. long current = line;
  213. charcount_t lastStart = ichStart;
  214. while (ich < ichEnd)
  215. {
  216. ich++;
  217. switch (ReadFull<false>(p, last))
  218. {
  219. case kchRET:
  220. if (PeekFull(p, last) == kchNWL)
  221. {
  222. ich++;
  223. ReadFull<false>(p, last);
  224. }
  225. // fall-through
  226. case kchNWL:
  227. case kchLS:
  228. case kchPS:
  229. current++;
  230. lastStart = ich;
  231. break;
  232. case kchNUL:
  233. goto done;
  234. }
  235. }
  236. done:
  237. line = current;
  238. return lastStart;
  239. }
  240. template <typename EncodingPolicy>
  241. bool Scanner<EncodingPolicy>::TryReadEscape(EncodedCharPtr& startingLocation, EncodedCharPtr endOfSource, codepoint_t *outChar)
  242. {
  243. Assert(outChar != nullptr);
  244. Assert(startingLocation <= endOfSource);
  245. EncodedCharPtr currentLocation = startingLocation;
  246. codepoint_t charToOutput = 0x0;
  247. // '\' is Assumed as there is only one caller
  248. // Read 'u' characters
  249. if (currentLocation >= endOfSource || ReadFirst(currentLocation, endOfSource) != 'u')
  250. {
  251. return false;
  252. }
  253. bool expectCurly = false;
  254. if (currentLocation < endOfSource && PeekFirst(currentLocation, endOfSource) == '{' && es6UnicodeMode)
  255. {
  256. expectCurly = true;
  257. // Move past the character
  258. ReadFirst(currentLocation, endOfSource);
  259. }
  260. uint i = 0;
  261. OLECHAR ch = 0;
  262. int hexValue = 0;
  263. uint maxHexDigits = (expectCurly ? MAXUINT32 : 4u);
  264. for(; i < maxHexDigits && currentLocation < endOfSource; i++)
  265. {
  266. if (!Js::NumberUtilities::FHexDigit(ch = ReadFirst(currentLocation, endOfSource), &hexValue))
  267. {
  268. break;
  269. }
  270. charToOutput = charToOutput * 0x10 + hexValue;
  271. if (charToOutput > 0x10FFFF)
  272. {
  273. return false;
  274. }
  275. }
  276. //At least 4 characters have to be read
  277. if (i == 0 || (i != 4 && !expectCurly))
  278. {
  279. return false;
  280. }
  281. Assert(expectCurly ? es6UnicodeMode : true);
  282. if (expectCurly && ch != '}')
  283. {
  284. return false;
  285. }
  286. *outChar = charToOutput;
  287. startingLocation = currentLocation;
  288. return true;
  289. }
  290. template <typename EncodingPolicy>
  291. template <bool bScan>
  292. bool Scanner<EncodingPolicy>::TryReadCodePointRest(codepoint_t lower, EncodedCharPtr& startingLocation, EncodedCharPtr endOfSource, codepoint_t *outChar, bool *outContainsMultiUnitChar)
  293. {
  294. Assert(outChar != nullptr);
  295. Assert(outContainsMultiUnitChar != nullptr);
  296. Assert(es6UnicodeMode);
  297. Assert(Js::NumberUtilities::IsSurrogateLowerPart(lower));
  298. EncodedCharPtr currentLocation = startingLocation;
  299. *outChar = lower;
  300. if (currentLocation < endOfSource)
  301. {
  302. size_t restorePoint = m_cMultiUnits;
  303. codepoint_t upper = ReadFull<bScan>(currentLocation, endOfSource);
  304. if (Js::NumberUtilities::IsSurrogateUpperPart(upper))
  305. {
  306. *outChar = Js::NumberUtilities::SurrogatePairAsCodePoint(lower, upper);
  307. if (IsMultiUnitChar(static_cast<OLECHAR>(upper)))
  308. {
  309. *outContainsMultiUnitChar = true;
  310. }
  311. startingLocation = currentLocation;
  312. }
  313. else
  314. {
  315. RestoreMultiUnits(restorePoint);
  316. }
  317. }
  318. return true;
  319. }
  320. template <typename EncodingPolicy>
  321. template <bool bScan>
  322. __inline bool Scanner<EncodingPolicy>::TryReadCodePoint(EncodedCharPtr &startingLocation, EncodedCharPtr endOfSource, codepoint_t *outChar, bool *hasEscape, bool *outContainsMultiUnitChar)
  323. {
  324. Assert(outChar != nullptr);
  325. Assert(outContainsMultiUnitChar != nullptr);
  326. if (startingLocation >= endOfSource)
  327. {
  328. return false;
  329. }
  330. codepoint_t ch = ReadFull<bScan>(startingLocation, endOfSource);
  331. if (FBigChar(ch))
  332. {
  333. if (IsMultiUnitChar(static_cast<OLECHAR>(ch)))
  334. {
  335. *outContainsMultiUnitChar = true;
  336. }
  337. if (es6UnicodeMode && Js::NumberUtilities::IsSurrogateLowerPart(ch))
  338. {
  339. return TryReadCodePointRest<bScan>(ch, startingLocation, endOfSource, outChar, outContainsMultiUnitChar);
  340. }
  341. }
  342. else if (ch == '\\' && TryReadEscape(startingLocation, endOfSource, &ch))
  343. {
  344. *hasEscape = true;
  345. }
  346. *outChar = ch;
  347. return true;
  348. }
  349. template <typename EncodingPolicy>
  350. tokens Scanner<EncodingPolicy>::ScanIdentifier(bool identifyKwds, EncodedCharPtr *pp)
  351. {
  352. EncodedCharPtr p = *pp;
  353. EncodedCharPtr pchMin = p;
  354. // JS6 allows unicode characters in the form of \uxxxx escape sequences
  355. // to be part of the identifier.
  356. bool fHasEscape = false;
  357. bool fHasMultiChar = false;
  358. codepoint_t codePoint = INVALID_CODEPOINT;
  359. size_t multiUnitsBeforeLast = m_cMultiUnits;
  360. // Check if we started the id
  361. if (!TryReadCodePoint<true>(p, m_pchLast, &codePoint, &fHasEscape, &fHasMultiChar))
  362. {
  363. // If no chars. could be scanned as part of the identifier, return error.
  364. return tkScanError;
  365. }
  366. Assert(codePoint < 0x110000u);
  367. if (!charClassifier->IsIdStart(codePoint))
  368. {
  369. // Put back the last character
  370. RestoreMultiUnits(multiUnitsBeforeLast);
  371. // If no chars. could be scanned as part of the identifier, return error.
  372. return tkScanError;
  373. }
  374. return ScanIdentifierContinue(identifyKwds, fHasEscape, fHasMultiChar, pchMin, p, pp);
  375. }
  376. template <typename EncodingPolicy>
  377. BOOL Scanner<EncodingPolicy>::FastIdentifierContinue(EncodedCharPtr&p, EncodedCharPtr last)
  378. {
  379. if (MultiUnitEncoding)
  380. {
  381. while (p < last)
  382. {
  383. EncodedChar currentChar = *p;
  384. if (IsMultiUnitChar(currentChar))
  385. {
  386. // multi unit character, we may not have reach the end yet
  387. return FALSE;
  388. }
  389. Assert(currentChar != '\\' || !charClassifier->IsIdContinueFast<false>(currentChar));
  390. if (!charClassifier->IsIdContinueFast<false>(currentChar))
  391. {
  392. // only reach the end of the identifier if it is not the start of an escape sequence
  393. return currentChar != '\\';
  394. }
  395. p++;
  396. }
  397. // We have reach the end of the identifier.
  398. return TRUE;
  399. }
  400. // Not fast path for non multi unit encoding
  401. return false;
  402. }
  403. template <typename EncodingPolicy>
  404. tokens Scanner<EncodingPolicy>::ScanIdentifierContinue(bool identifyKwds, bool fHasEscape, bool fHasMultiChar,
  405. EncodedCharPtr pchMin, EncodedCharPtr p, EncodedCharPtr *pp)
  406. {
  407. EncodedCharPtr last = m_pchLast;
  408. while (true)
  409. {
  410. // Fast path for utf8, non-multi unit char and not escape
  411. if (FastIdentifierContinue(p, last))
  412. {
  413. break;
  414. }
  415. // Slow path that has to deal with multi unit encoding
  416. codepoint_t codePoint = INVALID_CODEPOINT;
  417. EncodedCharPtr pchBeforeLast = p;
  418. size_t multiUnitsBeforeLast = m_cMultiUnits;
  419. if (TryReadCodePoint<true>(p, last, &codePoint, &fHasEscape, &fHasMultiChar))
  420. {
  421. Assert(codePoint < 0x110000u);
  422. if (charClassifier->IsIdContinue(codePoint))
  423. {
  424. continue;
  425. }
  426. }
  427. // Put back the last character
  428. p = pchBeforeLast;
  429. RestoreMultiUnits(multiUnitsBeforeLast);
  430. break;
  431. }
  432. Assert(p - pchMin > 0 && p - pchMin <= LONG_MAX);
  433. *pp = p;
  434. if (!identifyKwds)
  435. {
  436. return tkID;
  437. }
  438. // During syntax coloring, scanner doesn't need to convert the escape sequence to get actual characters, it just needs the classification information
  439. // So call up hashtables custom method to check if the string scanned is identifier or keyword.
  440. // Do the same for deferred parsing, but use a custom method that only tokenizes JS keywords.
  441. if ((m_DeferredParseFlags & ScanFlagSuppressIdPid) != 0)
  442. {
  443. m_ptoken->SetIdentifier(NULL);
  444. if (!fHasEscape)
  445. {
  446. // If there are no escape, that the main scan loop would have found the keyword already
  447. // So we can just assume it is an ID
  448. DebugOnly(long cch = UnescapeToTempBuf(pchMin, p));
  449. DebugOnly(tokens tk = m_phtbl->TkFromNameLen(m_tempChBuf.m_prgch, cch, IsStrictMode()));
  450. Assert(tk == tkID || (tk == tkYIELD && !m_fYieldIsKeyword) || (tk == tkAWAIT && !m_fAwaitIsKeyword));
  451. return tkID;
  452. }
  453. long cch = UnescapeToTempBuf(pchMin, p);
  454. tokens tk = m_phtbl->TkFromNameLen(m_tempChBuf.m_prgch, cch, IsStrictMode());
  455. return (!m_fYieldIsKeyword && tk == tkYIELD) || (!m_fAwaitIsKeyword && tk == tkAWAIT) ? tkID : tk;
  456. }
  457. else if (m_fSyntaxColor)
  458. {
  459. m_ptoken->SetIdentifier(NULL);
  460. // We always need to check TkFromNameLenColor because
  461. // the main Scan switch doesn't detect all non-keyword that needs coloring
  462. // (e.g. int)
  463. long cch = UnescapeToTempBuf(pchMin, p);
  464. return m_phtbl->TkFromNameLenColor(m_tempChBuf.m_prgch, cch);
  465. }
  466. // UTF16 Scanner are only for syntax coloring, so it shouldn't come here.
  467. if (MultiUnitEncoding && !fHasMultiChar && !fHasEscape)
  468. {
  469. Assert(sizeof(EncodedChar) == 1);
  470. // If there are no escape, that the main scan loop would have found the keyword already
  471. // So we can just assume it is an ID
  472. DebugOnly(long cch = UnescapeToTempBuf(pchMin, p));
  473. DebugOnly(tokens tk = m_phtbl->TkFromNameLen(m_tempChBuf.m_prgch, cch, IsStrictMode()));
  474. Assert(tk == tkID || (tk == tkYIELD && !m_fYieldIsKeyword) || (tk == tkAWAIT && !m_fAwaitIsKeyword));
  475. m_ptoken->SetIdentifier(reinterpret_cast<const char *>(pchMin), (long)(p - pchMin));
  476. return tkID;
  477. }
  478. IdentPtr pid = PidOfIdentiferAt(pchMin, p, fHasEscape, fHasMultiChar);
  479. m_ptoken->SetIdentifier(pid);
  480. if (!fHasEscape)
  481. {
  482. // If it doesn't have escape, then Scan() should have taken care of keywords (except
  483. // yield if m_fYieldIsKeyword is false, in which case yield is treated as an identifier, and except
  484. // await if m_fAwaitIsKeyword is false, in which case await is treated as an identifier).
  485. // We don't have to check if the name is reserved word and return it as an Identifier
  486. Assert(pid->Tk(IsStrictMode()) == tkID
  487. || (pid->Tk(IsStrictMode()) == tkYIELD && !m_fYieldIsKeyword)
  488. || (pid->Tk(IsStrictMode()) == tkAWAIT && !m_fAwaitIsKeyword));
  489. return tkID;
  490. }
  491. tokens tk = pid->Tk(IsStrictMode());
  492. return tk == tkID || (tk == tkYIELD && !m_fYieldIsKeyword) || (tk == tkAWAIT && !m_fAwaitIsKeyword) ? tkID : tkNone;
  493. }
  494. template <typename EncodingPolicy>
  495. IdentPtr Scanner<EncodingPolicy>::PidAt(size_t iecpMin, size_t iecpLim)
  496. {
  497. Assert(iecpMin < AdjustedLength() && iecpLim <= AdjustedLength() && iecpLim > iecpMin);
  498. return PidOfIdentiferAt(m_pchBase + iecpMin, m_pchBase + iecpLim);
  499. }
  500. template <typename EncodingPolicy>
  501. ulong Scanner<EncodingPolicy>::UnescapeToTempBuf(EncodedCharPtr p, EncodedCharPtr last)
  502. {
  503. m_tempChBuf.Init();
  504. while( p < last )
  505. {
  506. codepoint_t codePoint;
  507. bool hasEscape, isMultiChar;
  508. bool gotCodePoint = TryReadCodePoint<false>(p, last, &codePoint, &hasEscape, &isMultiChar);
  509. Assert(gotCodePoint);
  510. Assert(codePoint < 0x110000);
  511. if (codePoint < 0x10000)
  512. {
  513. m_tempChBuf.AppendCh((OLECHAR)codePoint);
  514. }
  515. else
  516. {
  517. char16 lower, upper;
  518. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &upper);
  519. m_tempChBuf.AppendCh(lower);
  520. m_tempChBuf.AppendCh(upper);
  521. }
  522. }
  523. return m_tempChBuf.m_ichCur;
  524. }
  525. template <typename EncodingPolicy>
  526. IdentPtr Scanner<EncodingPolicy>::PidOfIdentiferAt(EncodedCharPtr p, EncodedCharPtr last)
  527. {
  528. long cch = UnescapeToTempBuf(p, last);
  529. return m_phtbl->PidHashNameLen(m_tempChBuf.m_prgch, cch);
  530. }
  531. template <typename EncodingPolicy>
  532. IdentPtr Scanner<EncodingPolicy>::PidOfIdentiferAt(EncodedCharPtr p, EncodedCharPtr last, bool fHadEscape, bool fHasMultiChar)
  533. {
  534. // If there is an escape sequence in the JS6 identifier or it is a UTF8
  535. // source then we have to convert it to the equivalent char so we use a
  536. // buffer for translation.
  537. if ((MultiUnitEncoding && fHasMultiChar) || fHadEscape)
  538. {
  539. return PidOfIdentiferAt(p, last);
  540. }
  541. else if (MultiUnitEncoding)
  542. {
  543. Assert(sizeof(EncodedChar) == 1);
  544. return m_phtbl->PidHashNameLen(reinterpret_cast<const char *>(p), (long)(last - p));
  545. }
  546. else
  547. {
  548. Assert(sizeof(EncodedChar) == 2);
  549. return m_phtbl->PidHashNameLen(reinterpret_cast< const char16 * >(p), (long)(last - p));
  550. }
  551. }
  552. template <typename EncodingPolicy>
  553. typename Scanner<EncodingPolicy>::EncodedCharPtr Scanner<EncodingPolicy>::FScanNumber(EncodedCharPtr p, double *pdbl, bool& likelyInt)
  554. {
  555. EncodedCharPtr last = m_pchLast;
  556. EncodedCharPtr pchT;
  557. likelyInt = true;
  558. // Reset
  559. m_OctOrLeadingZeroOnLastTKNumber = false;
  560. if ('0' == PeekFirst(p, last))
  561. {
  562. switch(PeekFirst(p + 1, last))
  563. {
  564. case '.':
  565. case 'e':
  566. case 'E':
  567. likelyInt = false;
  568. // Floating point
  569. goto LFloat;
  570. case 'x':
  571. case 'X':
  572. // Hex
  573. *pdbl = Js::NumberUtilities::DblFromHex(p + 2, &pchT);
  574. if (pchT == p + 2)
  575. {
  576. // "Octal zero token "0" followed by an identifier token beginning with character 'x'/'X'
  577. *pdbl = 0;
  578. return p + 1;
  579. }
  580. else
  581. return pchT;
  582. case 'o':
  583. case 'O':
  584. // Octal
  585. *pdbl = Js::NumberUtilities::DblFromOctal(p + 2, &pchT);
  586. if (pchT == p + 2)
  587. {
  588. // "Octal zero token "0" followed by an identifier token beginning with character 'o'/'O'
  589. *pdbl = 0;
  590. return p + 1;
  591. }
  592. return pchT;
  593. case 'b':
  594. case 'B':
  595. // Binary
  596. *pdbl = Js::NumberUtilities::DblFromBinary(p + 2, &pchT);
  597. if (pchT == p + 2)
  598. {
  599. // "Octal zero token "0" followed by an identifier token beginning with character 'b'/'B'
  600. *pdbl = 0;
  601. return p + 1;
  602. }
  603. return pchT;
  604. default:
  605. // Octal
  606. *pdbl = Js::NumberUtilities::DblFromOctal(p, &pchT);
  607. Assert(pchT > p);
  608. #if !SOURCERELEASE
  609. // If an octal literal is malformed then it is in fact a decimal literal.
  610. #endif // !SOURCERELEASE
  611. if(*pdbl != 0 || pchT > p + 1)
  612. m_OctOrLeadingZeroOnLastTKNumber = true; //report as an octal or hex for JSON when leading 0. Just '0' is ok
  613. switch (*pchT)
  614. {
  615. case '8':
  616. case '9':
  617. // case 'e':
  618. // case 'E':
  619. // case '.':
  620. m_OctOrLeadingZeroOnLastTKNumber = false; //08... or 09....
  621. goto LFloat;
  622. }
  623. return pchT;
  624. }
  625. }
  626. else
  627. {
  628. LFloat:
  629. *pdbl = Js::NumberUtilities::StrToDbl(p, &pchT, likelyInt);
  630. Assert(pchT == p || !Js::NumberUtilities::IsNan(*pdbl));
  631. return pchT;
  632. }
  633. }
  634. template <typename EncodingPolicy>
  635. BOOL Scanner<EncodingPolicy>::oFScanNumber(double *pdbl, bool& likelyInt)
  636. {
  637. EncodedCharPtr pchT;
  638. m_OctOrLeadingZeroOnLastTKNumber = false;
  639. likelyInt = true;
  640. if ('0' == *m_currentCharacter)
  641. {
  642. switch (m_currentCharacter[1])
  643. {
  644. case '.':
  645. case 'e':
  646. case 'E':
  647. likelyInt = false;
  648. // Floating point.
  649. goto LFloat;
  650. case 'x':
  651. case 'X':
  652. // Hex.
  653. *pdbl = Js::NumberUtilities::DblFromHex<EncodedChar>(m_currentCharacter + 2, &pchT);
  654. if (pchT == m_currentCharacter + 2)
  655. {
  656. // "Octal zero token "0" followed by an identifier token beginning with character 'x'/'X'
  657. *pdbl = 0;
  658. m_currentCharacter++;
  659. }
  660. else
  661. m_currentCharacter = pchT;
  662. break;
  663. case 'o':
  664. case 'O':
  665. *pdbl = Js::NumberUtilities::DblFromOctal(m_currentCharacter + 2, &pchT);
  666. if (pchT == m_currentCharacter + 2)
  667. {
  668. // "Octal zero token "0" followed by an identifier token beginning with character 'o'/'O'
  669. *pdbl = 0;
  670. m_currentCharacter++;
  671. }
  672. else
  673. m_currentCharacter = pchT;
  674. break;
  675. case 'b':
  676. case 'B':
  677. *pdbl = Js::NumberUtilities::DblFromBinary(m_currentCharacter + 2, &pchT);
  678. if (pchT == m_currentCharacter + 2)
  679. {
  680. // "Octal zero token "0" followed by an identifier token beginning with character 'b'/'B'
  681. *pdbl = 0;
  682. m_currentCharacter++;
  683. }
  684. else
  685. m_currentCharacter = pchT;
  686. break;
  687. default:
  688. // Octal.
  689. *pdbl = Js::NumberUtilities::DblFromOctal(m_currentCharacter, &pchT);
  690. Assert(pchT > m_currentCharacter);
  691. #if !SOURCERELEASE
  692. // If an octal literal is malformed then it is in fact a decimal literal.
  693. #endif // !SOURCERELEASE
  694. if(*pdbl != 0 || pchT > m_currentCharacter + 1)
  695. m_OctOrLeadingZeroOnLastTKNumber = true; //report as an octal or hex for JSON when leading 0. Just '0' is ok
  696. switch (*pchT)
  697. {
  698. case '8':
  699. case '9':
  700. // case 'e':
  701. // case 'E':
  702. // case '.':
  703. m_OctOrLeadingZeroOnLastTKNumber = false; //08... or 09....
  704. goto LFloat;
  705. }
  706. m_currentCharacter = pchT;
  707. break;
  708. }
  709. }
  710. else
  711. {
  712. LFloat:
  713. // Let StrToDbl do all the work.
  714. *pdbl = Js::NumberUtilities::StrToDbl(m_currentCharacter, &pchT, likelyInt);
  715. if (pchT == m_currentCharacter)
  716. return FALSE;
  717. m_currentCharacter = pchT;
  718. Assert(!Js::NumberUtilities::IsNan(*pdbl));
  719. }
  720. return TRUE;
  721. }
  722. template <typename EncodingPolicy>
  723. tokens Scanner<EncodingPolicy>::TryRescanRegExp()
  724. {
  725. EncodedCharPtr current = m_currentCharacter;
  726. tokens result = RescanRegExp();
  727. if (result == tkScanError)
  728. m_currentCharacter = current;
  729. return result;
  730. }
  731. template <typename EncodingPolicy>
  732. tokens Scanner<EncodingPolicy>::RescanRegExp()
  733. {
  734. #if DEBUG
  735. switch (m_ptoken->tk)
  736. {
  737. case tkDiv:
  738. Assert(m_currentCharacter == m_pchMinTok + 1);
  739. break;
  740. case tkAsgDiv:
  741. Assert(m_currentCharacter == m_pchMinTok + 2);
  742. break;
  743. default:
  744. AssertMsg(FALSE, "Who is calling RescanRegExp?");
  745. break;
  746. }
  747. #endif //DEBUG
  748. m_currentCharacter = m_pchMinTok;
  749. if (*m_currentCharacter != '/')
  750. Error(ERRnoSlash);
  751. m_currentCharacter++;
  752. tokens tk = tkNone;
  753. {
  754. ArenaAllocator alloc(_u("RescanRegExp"), m_parser->GetAllocator()->GetPageAllocator(), m_parser->GetAllocator()->outOfMemoryFunc);
  755. tk = ScanRegExpConstant(&alloc);
  756. }
  757. return tk;
  758. }
  759. template <typename EncodingPolicy>
  760. tokens Scanner<EncodingPolicy>::RescanRegExpNoAST()
  761. {
  762. #if DEBUG
  763. switch (m_ptoken->tk)
  764. {
  765. case tkDiv:
  766. Assert(m_currentCharacter == m_pchMinTok + 1);
  767. break;
  768. case tkAsgDiv:
  769. Assert(m_currentCharacter == m_pchMinTok + 2);
  770. break;
  771. default:
  772. AssertMsg(FALSE, "Who is calling RescanRegExpNoParseTree?");
  773. break;
  774. }
  775. #endif //DEBUG
  776. m_currentCharacter = m_pchMinTok;
  777. if (*m_currentCharacter != '/')
  778. Error(ERRnoSlash);
  779. m_currentCharacter++;
  780. tokens tk = tkNone;
  781. {
  782. ArenaAllocator alloc(_u("RescanRegExp"), m_parser->GetAllocator()->GetPageAllocator(), m_parser->GetAllocator()->outOfMemoryFunc);
  783. {
  784. tk = ScanRegExpConstantNoAST(&alloc);
  785. }
  786. }
  787. return tk;
  788. }
  789. template <typename EncodingPolicy>
  790. tokens Scanner<EncodingPolicy>::RescanRegExpTokenizer()
  791. {
  792. #if DEBUG
  793. switch (m_ptoken->tk)
  794. {
  795. case tkDiv:
  796. Assert(m_currentCharacter == m_pchMinTok + 1);
  797. break;
  798. case tkAsgDiv:
  799. Assert(m_currentCharacter == m_pchMinTok + 2);
  800. break;
  801. default:
  802. AssertMsg(FALSE, "Who is calling RescanRegExpNoParseTree?");
  803. break;
  804. }
  805. #endif //DEBUG
  806. m_currentCharacter = m_pchMinTok;
  807. if (*m_currentCharacter != '/')
  808. Error(ERRnoSlash);
  809. m_currentCharacter++;
  810. tokens tk = tkNone;
  811. ThreadContext *threadContext = ThreadContext::GetContextForCurrentThread();
  812. threadContext->EnsureRecycler();
  813. Js::TempArenaAllocatorObject *alloc = threadContext->GetTemporaryAllocator(_u("RescanRegExp"));
  814. __try
  815. {
  816. tk = ScanRegExpConstantNoAST(alloc->GetAllocator());
  817. }
  818. __finally
  819. {
  820. threadContext->ReleaseTemporaryAllocator(alloc);
  821. }
  822. return tk;
  823. }
  824. template <typename EncodingPolicy>
  825. tokens Scanner<EncodingPolicy>::ScanRegExpConstant(ArenaAllocator* alloc)
  826. {
  827. if (m_parser && m_parser->IsBackgroundParser())
  828. {
  829. PROBE_STACK_NO_DISPOSE(m_scriptContext, Js::Constants::MinStackRegex);
  830. }
  831. else
  832. {
  833. PROBE_STACK(m_scriptContext, Js::Constants::MinStackRegex);
  834. }
  835. // SEE ALSO: RegexHelper::PrimCompileDynamic()
  836. #ifdef PROFILE_EXEC
  837. m_scriptContext->ProfileBegin(Js::RegexCompilePhase);
  838. #endif
  839. ArenaAllocator* ctAllocator = alloc;
  840. UnifiedRegex::StandardChars<EncodedChar>* standardEncodedChars = m_scriptContext->GetThreadContext()->GetStandardChars((EncodedChar*)0);
  841. UnifiedRegex::StandardChars<char16>* standardChars = m_scriptContext->GetThreadContext()->GetStandardChars((char16*)0);
  842. #if ENABLE_REGEX_CONFIG_OPTIONS
  843. UnifiedRegex::DebugWriter *w = 0;
  844. if (REGEX_CONFIG_FLAG(RegexDebug))
  845. w = m_scriptContext->GetRegexDebugWriter();
  846. if (REGEX_CONFIG_FLAG(RegexProfile))
  847. m_scriptContext->GetRegexStatsDatabase()->BeginProfile();
  848. #endif
  849. UnifiedRegex::Node* root = 0;
  850. charcount_t totalLen = 0, bodyChars = 0, totalChars = 0, bodyLen = 0;
  851. UnifiedRegex::RegexFlags flags = UnifiedRegex::NoRegexFlags;
  852. UnifiedRegex::Parser<EncodingPolicy, true> parser
  853. ( m_scriptContext
  854. , ctAllocator
  855. , standardEncodedChars
  856. , standardChars
  857. , IsFromExternalSource()
  858. #if ENABLE_REGEX_CONFIG_OPTIONS
  859. , w
  860. #endif
  861. );
  862. try
  863. {
  864. root = parser.ParseLiteral(m_currentCharacter, m_pchLast, bodyLen, totalLen, bodyChars, totalChars, flags);
  865. }
  866. catch (UnifiedRegex::ParseError e)
  867. {
  868. #ifdef PROFILE_EXEC
  869. m_scriptContext->ProfileEnd(Js::RegexCompilePhase);
  870. #endif
  871. if (m_fSyntaxColor)
  872. return ScanError(m_currentCharacter + e.encodedPos, tkRegExp);
  873. m_currentCharacter += e.encodedPos;
  874. Error(e.error);
  875. }
  876. UnifiedRegex::RegexPattern* pattern;
  877. if (m_parser->IsBackgroundParser())
  878. {
  879. // Avoid allocating pattern from recycler on background thread. The main thread will create the pattern
  880. // and hook it to this parse node.
  881. pattern = parser.CompileProgram<false>(root, m_currentCharacter, totalLen, bodyChars, totalChars, flags);
  882. }
  883. else
  884. {
  885. pattern = parser.CompileProgram<true>(root, m_currentCharacter, totalLen, bodyChars, totalChars, flags);
  886. }
  887. RestoreMultiUnits(m_cMultiUnits + parser.GetMultiUnits()); // m_currentCharacter changed, sync MultiUnits
  888. return m_ptoken->SetRegex(pattern, m_parser);
  889. }
  890. template<typename EncodingPolicy>
  891. tokens Scanner<EncodingPolicy>::ScanRegExpConstantNoAST(ArenaAllocator* alloc)
  892. {
  893. if (m_parser && m_parser->IsBackgroundParser())
  894. {
  895. PROBE_STACK_NO_DISPOSE(m_scriptContext, Js::Constants::MinStackRegex);
  896. }
  897. else
  898. {
  899. PROBE_STACK(m_scriptContext, Js::Constants::MinStackRegex);
  900. }
  901. ThreadContext *threadContext = m_fSyntaxColor ? ThreadContext::GetContextForCurrentThread() : m_scriptContext->GetThreadContext();
  902. UnifiedRegex::StandardChars<EncodedChar>* standardEncodedChars = threadContext->GetStandardChars((EncodedChar*)0);
  903. UnifiedRegex::StandardChars<char16>* standardChars = threadContext->GetStandardChars((char16*)0);
  904. charcount_t totalLen = 0, bodyChars = 0, totalChars = 0, bodyLen = 0;
  905. UnifiedRegex::Parser<EncodingPolicy, true> parser
  906. ( m_scriptContext
  907. , alloc
  908. , standardEncodedChars
  909. , standardChars
  910. , IsFromExternalSource()
  911. #if ENABLE_REGEX_CONFIG_OPTIONS
  912. , 0
  913. #endif
  914. );
  915. try
  916. {
  917. parser.ParseLiteralNoAST(m_currentCharacter, m_pchLast, bodyLen, totalLen, bodyChars, totalChars);
  918. }
  919. catch (UnifiedRegex::ParseError e)
  920. {
  921. if (m_fSyntaxColor)
  922. return ScanError(m_currentCharacter + e.encodedPos, tkRegExp);
  923. m_currentCharacter += e.encodedPos;
  924. Error(e.error);
  925. // never reached
  926. }
  927. UnifiedRegex::RegexPattern* pattern = parser.CompileProgram<false>(nullptr, m_currentCharacter, totalLen, bodyChars, totalChars, UnifiedRegex::NoRegexFlags);
  928. Assert(pattern == nullptr); // BuildAST == false, CompileProgram should return nullptr
  929. RestoreMultiUnits(m_cMultiUnits + parser.GetMultiUnits()); // m_currentCharacter changed, sync MultiUnits
  930. return (m_ptoken->tk = tkRegExp);
  931. }
  932. template<typename EncodingPolicy>
  933. tokens Scanner<EncodingPolicy>::ScanStringTemplateBegin(EncodedCharPtr *pp)
  934. {
  935. // String template must begin with a string constant followed by '`' or '${'
  936. ScanStringConstant<true, true>('`', pp);
  937. OLECHAR ch;
  938. EncodedCharPtr last = m_pchLast;
  939. ch = ReadFirst(*pp, last);
  940. if (ch == '`')
  941. {
  942. // Simple string template - no substitutions
  943. return tkStrTmplBasic;
  944. }
  945. else if (ch == '$')
  946. {
  947. ch = ReadFirst(*pp, last);
  948. if (ch == '{')
  949. {
  950. // Next token after expr should be tkStrTmplMid or tkStrTmplEnd.
  951. // In string template scanning mode, we expect the next char to be '}'
  952. // and will treat it as the beginning of tkStrTmplEnd or tkStrTmplMid
  953. m_fStringTemplateDepth++;
  954. // Regular string template begin - next is first substitution
  955. return tkStrTmplBegin;
  956. }
  957. }
  958. // Error - make sure pointer stays at the last character of the error token instead of after it in the error case
  959. (*pp)--;
  960. return ScanError(m_currentCharacter, tkStrTmplBegin);
  961. }
  962. template<typename EncodingPolicy>
  963. tokens Scanner<EncodingPolicy>::ScanStringTemplateMiddleOrEnd(EncodedCharPtr *pp)
  964. {
  965. // String template middle and end tokens must begin with a string constant
  966. ScanStringConstant<true, true>('`', pp);
  967. OLECHAR ch;
  968. EncodedCharPtr last = m_pchLast;
  969. ch = ReadFirst(*pp, last);
  970. if (ch == '`')
  971. {
  972. // No longer in string template scanning mode
  973. m_fStringTemplateDepth--;
  974. // This is the last part of the template ...`
  975. return tkStrTmplEnd;
  976. }
  977. else if (ch == '$')
  978. {
  979. ch = ReadFirst(*pp, last);
  980. if (ch == '{')
  981. {
  982. // This is just another middle part of the template }...${
  983. return tkStrTmplMid;
  984. }
  985. }
  986. // Error - make sure pointer stays at the last character of the error token instead of after it in the error case
  987. (*pp)--;
  988. return ScanError(m_currentCharacter, tkStrTmplEnd);
  989. }
  990. /*****************************************************************************
  991. *
  992. * Parses a string constant. Note that the string value is stored in
  993. * a volatile buffer (or allocated on the heap if too long), and thus
  994. * the string should be saved off before the next token is scanned.
  *
  * Template parameters (the static_assert requires both to have the same value):
  *   stringTemplateMode - scanning a piece of a template literal: line breaks
  *                        are allowed, scanning stops in front of '`' or "${",
  *                        and octal escapes are rejected.
  *   createRawString    - also collect the raw source text in
  *                        m_tempChBufSecondary.
  *
  * Parameters:
  *   delim - the quote character that terminates the literal.
  *   pp    - in/out cursor. On entry it points at the first character of the
  *           string body; on return it points at where scanning stopped.
  *
  * Returns tkStrCon on success (the token's identifier is set, or set to NULL
  * when pid creation is suppressed). On malformed input the function returns
  * the result of ScanError() in syntax-coloring mode and calls Error()
  * otherwise.
  995. */
  996. template<typename EncodingPolicy>
  997. template<bool stringTemplateMode, bool createRawString>
  998. tokens Scanner<EncodingPolicy>::ScanStringConstant(OLECHAR delim, EncodedCharPtr *pp)
  999. {
  1000. static_assert((stringTemplateMode && createRawString) || (!stringTemplateMode && !createRawString), "stringTemplateMode and createRawString must have the same value");
  // ch    - cooked value of the current character / escape sequence.
  // rawch - last raw character to append to the raw buffer for this iteration.
  // c, wT - scratch character and its numeric digit value while decoding escapes.
  1001. OLECHAR ch, c, rawch;
  1002. int wT;
  1003. EncodedCharPtr p = *pp;
  1004. EncodedCharPtr last = m_pchLast;
  1005. // Reset
  1006. m_OctOrLeadingZeroOnLastTKNumber = false;
  1007. m_EscapeOnLastTkStrCon = FALSE;
  1008. m_tempChBuf.Init();
  1009. // Use template parameter to gate raw string creation.
  1010. // If createRawString is false, all these operations should be no-ops
  1011. if (createRawString)
  1012. {
  1013. m_tempChBufSecondary.Init();
  1014. }
  // Main loop: each iteration consumes one source character or one escape
  // sequence, then appends the cooked value to m_tempChBuf and the raw text to
  // m_tempChBufSecondary (see the two AppendCh calls at the bottom of the loop).
  1015. for (;;)
  1016. {
  1017. switch ((rawch = ch = ReadFirst(p, last)))
  1018. {
  1019. case kchRET:
  1020. if (stringTemplateMode)
  1021. {
  1022. if (PeekFirst(p, last) == kchNWL)
  1023. {
  1024. // Eat the <LF> char, ignore return
  1025. ReadFirst(p, last);
  1026. }
  1027. // Both <CR> and <CR><LF> are normalized to <LF> in template cooked and raw values
  1028. ch = rawch = kchNWL;
  1029. }
  1030. LEcmaLineBreak:
  1031. // Fall through
  1032. case kchNWL:
  1033. if (stringTemplateMode)
  1034. {
  1035. // Notify the scanner to update current line, number of lines etc
  1036. NotifyScannedNewLine();
  1037. break;
  1038. }
  // Unescaped line break inside an ordinary string literal: the string is
  // unterminated. Leave the cursor on the line-break character.
  1039. m_currentCharacter = p - 1;
  1040. if (m_fSyntaxColor)
  1041. {
  1042. *pp = p - 1;
  1043. return ScanError(p - 1, tkStrCon);
  1044. }
  // NOTE(review): Error() presumably does not return; if it did, control would
  // fall through into the quote cases below - confirm.
  1045. Error(ERRnoStrEnd);
  // A quote character ends the literal only when it matches the delimiter;
  // the other kind of quote is an ordinary character.
  1046. case '"':
  1047. case '\'':
  1048. if (ch == delim)
  1049. goto LBreak;
  1050. break;
  1051. case '`':
  1052. // In string template scan mode, don't consume the '`' - we need to differentiate
  1053. // between a closed string template and the expression open sequence - ${
  1054. if (stringTemplateMode)
  1055. {
  1056. p--;
  1057. goto LBreak;
  1058. }
  1059. // If we aren't scanning for a string template, do the default thing
  1060. goto LMainDefault;
  1061. case '$':
  1062. // If we are parsing a string literal part of a string template, ${ indicates we need to switch
  1063. // to parsing an expression.
  1064. if (stringTemplateMode && PeekFirst(p, last) == '{')
  1065. {
  1066. // Rewind to the $ and return
  1067. p--;
  1068. goto LBreak;
  1069. }
  1070. // If we aren't scanning for a string template, do the default thing
  1071. goto LMainDefault;
  // A NUL at or beyond the end of the buffer means the literal was never
  // closed; a NUL before the end is kept as an ordinary character.
  1072. case kchNUL:
  1073. if (p >= last)
  1074. {
  1075. m_currentCharacter = p - 1;
  1076. if (m_fSyntaxColor)
  1077. {
  1078. *pp = p - 1;
  1079. return ScanError(p - 1, tkStrCon);
  1080. }
  1081. Error(ERRnoStrEnd);
  1082. }
  1083. break;
  1084. default:
  1085. LMainDefault:
  // Multi-unit characters are completed with ReadRest; U+2028 / U+2029 are
  // routed to the same handling as an unescaped line break.
  1086. if (IsMultiUnitChar(ch))
  1087. {
  1088. if ((ch == kchLS || ch == kchPS))
  1089. {
  1090. goto LEcmaLineBreak;
  1091. }
  1092. rawch = ch = ReadRest<true>(ch, p, last);
  1093. switch (ch)
  1094. {
  1095. case kchLS: // 0x2028, classifies as new line
  1096. case kchPS: // 0x2029, classifies as new line
  1097. goto LEcmaLineBreak;
  1098. }
  1099. }
  1100. break;
  // Backslash: decode an escape sequence. ch ends up holding the cooked value,
  // rawch the last raw character of the sequence.
  1101. case kchBSL:
  1102. // In raw mode '\\' is not an escape character, just add the char into the raw buffer.
  1103. m_tempChBufSecondary.AppendCh<createRawString>(ch);
  1104. m_EscapeOnLastTkStrCon=TRUE;
  1105. // In raw mode, we append the raw char itself and not the escaped value so save the char.
  1106. rawch = ch = ReadFirst(p, last);
  1107. codepoint_t codePoint = 0;
  // Default error reported by ReturnScanError; overridden on specific paths.
  1108. uint errorType = (uint)ERRbadHexDigit;
  1109. switch (ch)
  1110. {
  1111. case 'b':
  1112. ch = 0x08;
  1113. break;
  1114. case 't':
  1115. ch = 0x09;
  1116. break;
  1117. case 'v':
  1118. ch = 0x0B; //Only in ES5 mode
  1119. break; //same as default
  1120. case 'n':
  1121. ch = 0x0A;
  1122. break;
  1123. case 'f':
  1124. ch = 0x0C;
  1125. break;
  1126. case 'r':
  1127. ch = 0x0D;
  1128. break;
  1129. case 'x':
  1130. // Insert the 'x' here before jumping to parse the hex digits.
  1131. m_tempChBufSecondary.AppendCh<createRawString>(ch);
  1132. // 2 hex digits
  1133. ch = 0;
  1134. goto LTwoHex;
  1135. case 'u':
  1136. // Raw string just inserts a 'u' here.
  1137. m_tempChBufSecondary.AppendCh<createRawString>(ch);
  1138. ch = 0;
  // A hex digit right after the 'u' selects the fixed four-digit form; the
  // braced code point form is accepted only when es6UnicodeMode is set.
  1139. if (Js::NumberUtilities::FHexDigit(c = ReadFirst(p, last), &wT))
  1140. goto LFourHex;
  1141. else if (c != '{' || !this->es6UnicodeMode)
  1142. goto ReturnScanError;
  1143. Assert(c == '{');
  1144. // c should definitely be a '{' which should be appended to the raw string.
  1145. m_tempChBufSecondary.AppendCh<createRawString>(c);
  1146. //At least one digit is expected
  1147. if (!Js::NumberUtilities::FHexDigit(c = ReadFirst(p, last), &wT))
  1148. {
  1149. goto ReturnScanError;
  1150. }
  1151. m_tempChBufSecondary.AppendCh<createRawString>(c);
  1152. codePoint = static_cast<codepoint_t>(wT);
  // Accumulate further hex digits; the range check on every step keeps
  // codePoint from growing past 0x10FFFF.
  1153. while(Js::NumberUtilities::FHexDigit(c = ReadFirst(p, last), &wT))
  1154. {
  1155. m_tempChBufSecondary.AppendCh<createRawString>(c);
  1156. codePoint <<= 4;
  1157. codePoint += static_cast<codepoint_t>(wT);
  1158. if (codePoint > 0x10FFFF)
  1159. {
  1160. errorType = (uint)ERRInvalidCodePoint;
  1161. goto ReturnScanError;
  1162. }
  1163. }
  1164. if (c != '}')
  1165. {
  1166. errorType = (uint)ERRMissingCurlyBrace;
  1167. goto ReturnScanError;
  1168. }
  1169. Assert(codePoint <= 0x10FFFF);
  // Code points of 0x10000 and above are stored as a surrogate pair: the first
  // half is appended here, the second half (left in ch) at the bottom of the loop.
  1170. if (codePoint >= 0x10000)
  1171. {
  1172. OLECHAR lower = 0;
  1173. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &ch);
  1174. m_tempChBuf.AppendCh(lower);
  1175. }
  1176. else
  1177. {
  1178. ch = (char16)codePoint;
  1179. }
  1180. // In raw mode we want the last hex character or the closing curly. c should hold one or the other.
  1181. if (createRawString)
  1182. rawch = c;
  1183. break;
  // Four-digit u-escape: the first digit is already in c / wT.
  1184. LFourHex:
  1185. codePoint = 0x0;
  1186. // Append first hex digit character to the raw string.
  1187. m_tempChBufSecondary.AppendCh<createRawString>(c);
  1188. codePoint += static_cast<codepoint_t>(wT * 0x1000);
  1189. if (!Js::NumberUtilities::FHexDigit(c = ReadFirst(p, last), &wT))
  1190. goto ReturnScanError;
  1191. // Append the second hex digit of the four-digit u-escape to the raw string.
  1192. m_tempChBufSecondary.AppendCh<createRawString>(c);
  1193. codePoint += static_cast<codepoint_t>(wT * 0x0100);
  // Shared tail: the last two digits of a four-digit u-escape, or both digits
  // of an x-escape (which arrives here with codePoint == 0).
  1194. LTwoHex:
  1195. // This code path doesn't expect curly.
  1196. if (!Js::NumberUtilities::FHexDigit(c = ReadFirst(p, last), &wT))
  1197. goto ReturnScanError;
  1198. // Append this hex digit (third of a u-escape, first of an x-escape) to the raw string.
  1199. m_tempChBufSecondary.AppendCh<createRawString>(c);
  1200. codePoint += static_cast<codepoint_t>(wT * 0x0010);
  1201. if (!Js::NumberUtilities::FHexDigit(c = ReadFirst(p, last), &wT))
  1202. goto ReturnScanError;
  1203. codePoint += static_cast<codepoint_t>(wT);
  1204. // In raw mode the final hex digit (held in c) is appended at the bottom of the loop via rawch.
  1205. if (createRawString)
  1206. rawch = c;
  1207. if (codePoint < 0x10000)
  1208. {
  1209. ch = static_cast<OLECHAR>(codePoint);
  1210. }
  1211. else
  1212. {
  1213. goto ReturnScanError;
  1214. }
  1215. break;
  1216. case '0':
  1217. case '1':
  1218. case '2':
  1219. case '3':
  1220. // 1 to 3 octal digits
  1221. ch -= '0';
  1222. // Octal escape sequences are not allowed inside string template literals
  1223. if (stringTemplateMode)
  1224. {
  // Only a lone zero escape not followed by an octal digit is accepted here.
  1225. c = PeekFirst(p, last);
  1226. if (ch != 0 || (c >= '0' && c <= '7'))
  1227. {
  1228. errorType = (uint)ERRES5NoOctal;
  1229. goto ReturnScanError;
  1230. }
  1231. break;
  1232. }
  1233. wT = (c = ReadFirst(p, last)) - '0';
  // The cast to char16 makes characters below '0' wrap to large values, so this
  // one comparison rejects everything that is not an octal digit.
  1234. if ((char16)wT > 7)
  1235. {
  // Single-digit escape: flagged unless it is a zero escape followed by a
  // non-decimal-digit character.
  1236. if (ch != 0 || ((char16)wT <= 9))
  1237. {
  1238. m_OctOrLeadingZeroOnLastTKNumber = true;
  1239. }
  // Put back the character that was not part of the escape.
  1240. p--;
  1241. break;
  1242. }
  1243. m_OctOrLeadingZeroOnLastTKNumber = true;
  1244. ch = static_cast< OLECHAR >(ch * 8 + wT);
  1245. goto LOneOctal;
  1246. case '4':
  1247. case '5':
  1248. case '6':
  1249. case '7':
  1250. // 1 to 2 octal digits
  1251. // Octal escape sequences are not allowed inside string template literals
  1252. if (stringTemplateMode)
  1253. {
  1254. errorType = (uint)ERRES5NoOctal;
  1255. goto ReturnScanError;
  1256. }
  1257. ch -= '0';
  1258. m_OctOrLeadingZeroOnLastTKNumber = true;
  // Optional final octal digit.
  1259. LOneOctal:
  1260. wT = (c = ReadFirst(p, last)) - '0';
  1261. if ((char16)wT > 7)
  1262. {
  1263. p--;
  1264. break;
  1265. }
  1266. ch = static_cast< OLECHAR >(ch * 8 + wT);
  1267. break;
  // Backslash followed by a line terminator: a line continuation.
  1268. case kchRET: // 0xD
  1269. if (stringTemplateMode)
  1270. {
  1271. // If this is \<CR><LF> we can eat the <LF> right now
  1272. if (PeekFirst(p, last) == kchNWL)
  1273. {
  1274. // Eat the <LF> char, ignore return
  1275. ReadFirst(p, last);
  1276. }
  1277. // Both \<CR> and \<CR><LF> are normalized to \<LF> in template raw string
  1278. rawch = kchNWL;
  1279. }
  // Falls through into the shared line-continuation handling.
  1280. case kchLS: // 0x2028, classifies as new line
  1281. case kchPS: // 0x2029, classifies as new line
  1282. case kchNWL: // 0xA
  1283. LEcmaEscapeLineBreak:
  1284. if (stringTemplateMode)
  1285. {
  1286. // We're going to ignore the line continuation tokens for the cooked strings, but we need to append the token for raw strings
  1287. m_tempChBufSecondary.AppendCh<createRawString>(rawch);
  1288. // Template literal strings ignore all escaped line continuation tokens
  1289. NotifyScannedNewLine();
  1290. continue;
  1291. }
  // Ordinary string: let ScanNewLine update the line bookkeeping through
  // m_currentCharacter, then resume from wherever it left the cursor.
  1292. m_currentCharacter = p;
  1293. ScanNewLine(ch);
  1294. p = m_currentCharacter;
  1295. if (m_fSyntaxColor && *p == 0)
  1296. {
  1297. // Special case for multi-line strings during colorization.
  1298. m_scanState = delim == '"' ? ScanStateMultiLineDoubleQuoteString : ScanStateMultiLineSingleQuoteString;
  1299. *pp = p;
  1300. return tkStrCon;
  1301. }
  1302. continue;
  1303. case 0:
  1304. if (p >= last)
  1305. {
  1306. errorType = (uint)ERRnoStrEnd;
  // Common error exit for the escape decoder: leave the cursor on the offending
  // character and report errorType (or return a scan-error token when colorizing).
  1307. ReturnScanError:
  1308. m_currentCharacter = p - 1;
  1309. if (m_fSyntaxColor)
  1310. {
  1311. *pp = p - 1;
  1312. return ScanError(p - 1, tkStrCon);
  1313. }
  1314. Error(errorType);
  1315. }
  1316. else if (stringTemplateMode)
  1317. {
  1318. // Escaped null character is translated into 0x0030 for raw template literals
  1319. rawch = 0x0030;
  1320. }
  1321. break;
  // Any other escaped character stands for itself.
  1322. default:
  1323. if (IsMultiUnitChar(ch))
  1324. {
  1325. rawch = ch = ReadRest<true>(ch, p, last);
  1326. switch (ch)
  1327. {
  1328. case kchLS:
  1329. case kchPS:
  1330. goto LEcmaEscapeLineBreak;
  1331. }
  1332. }
  1333. break;
  1334. }
  1335. break;
  1336. }
  // Append the cooked character, and the raw character when raw strings are collected.
  1337. m_tempChBuf.AppendCh(ch);
  1338. m_tempChBufSecondary.AppendCh<createRawString>(rawch);
  1339. }
  // End of the string body. Decide whether to create a pid for the text: when
  // colorizing, or when ScanFlagSuppressStrPid is set, a pid is created only
  // for the exact ten-character text "use strict".
  1340. LBreak:
  1341. bool createPid = true;
  1342. if (m_fSyntaxColor || (m_DeferredParseFlags & ScanFlagSuppressStrPid) != 0)
  1343. {
  1344. createPid = false;
  1345. if ((m_tempChBuf.m_ichCur == 10) && (0 == memcmp(_u("use strict"), m_tempChBuf.m_prgch, m_tempChBuf.m_ichCur * sizeof(OLECHAR))))
  1346. {
  1347. createPid = true;
  1348. }
  1349. }
  1350. if (createPid)
  1351. {
  1352. m_ptoken->SetIdentifier(m_phtbl->PidHashNameLen(m_tempChBuf.m_prgch, m_tempChBuf.m_ichCur));
  1353. }
  1354. else
  1355. {
  1356. m_ptoken->SetIdentifier(NULL);
  1357. }
  1358. m_scanState = ScanStateNormal;
  1359. m_doubleQuoteOnLastTkStrCon = '"' == delim;
  // Hand the final cursor position back to the caller.
  1360. *pp = p;
  1361. return tkStrCon;
  1362. }
  1363. template<typename EncodingPolicy>
  1364. tokens Scanner<EncodingPolicy>::ScanStringConstant(OLECHAR delim, EncodedCharPtr *pp)
  1365. {
  1366. return ScanStringConstant<false, false>(delim, pp);
  1367. }
  1368. /*****************************************************************************
  1369. *
  1370. * Consume a C-style comment.
  1371. */
  1372. template<typename EncodingPolicy>
  1373. tokens Scanner<EncodingPolicy>::SkipComment(EncodedCharPtr *pp, /* out */ bool* containTypeDef)
  1374. {
  1375. Assert(containTypeDef != nullptr);
  1376. EncodedCharPtr p = *pp;
  1377. *containTypeDef = false;
  1378. EncodedCharPtr last = m_pchLast;
  1379. OLECHAR ch;
  1380. for (;;)
  1381. {
  1382. switch((ch = ReadFirst(p, last)))
  1383. {
  1384. case '*':
  1385. if (*p == '/')
  1386. {
  1387. *pp = p + 1;
  1388. if (m_fSyntaxColor)
  1389. {
  1390. m_scanState = ScanStateNormal;
  1391. return tkComment;
  1392. }
  1393. return tkNone;
  1394. }
  1395. break;
  1396. // ES 2015 11.3 Line Terminators
  1397. case kchLS: // 0x2028, classifies as new line
  1398. case kchPS: // 0x2029, classifies as new line
  1399. LEcmaLineBreak:
  1400. goto LLineBreak;
  1401. case kchRET:
  1402. case kchNWL:
  1403. LLineBreak:
  1404. m_fHadEol = TRUE;
  1405. m_currentCharacter = p;
  1406. ScanNewLine(ch);
  1407. p = m_currentCharacter;
  1408. break;
  1409. case kchNUL:
  1410. if (p >= last)
  1411. {
  1412. m_currentCharacter = p - 1;
  1413. *pp = p - 1;
  1414. if (m_fSyntaxColor)
  1415. {
  1416. m_scanState = ScanStateMultiLineComment;
  1417. return tkComment;
  1418. }
  1419. Error(ERRnoCmtEnd);
  1420. }
  1421. break;
  1422. default:
  1423. if (IsMultiUnitChar(ch))
  1424. {
  1425. ch = ReadRest<true>(ch, p, last);
  1426. switch (ch)
  1427. {
  1428. case kchLS:
  1429. case kchPS:
  1430. goto LEcmaLineBreak;
  1431. }
  1432. }
  1433. break;
  1434. }
  1435. }
  1436. }
  1437. /*****************************************************************************
  1438. *
  1439. * We've encountered a newline - update various counters and things.
  1440. */
  1441. template<typename EncodingPolicy>
  1442. void Scanner<EncodingPolicy>::ScanNewLine(uint ch)
  1443. {
  1444. if (ch == '\r' && PeekNextChar() == '\n')
  1445. {
  1446. ReadNextChar();
  1447. }
  1448. NotifyScannedNewLine();
  1449. }
  1450. /*****************************************************************************
  1451. *
  1452. * We've encountered a newline - update various counters and things.
  1453. */
  1454. template<typename EncodingPolicy>
  1455. void Scanner<EncodingPolicy>::NotifyScannedNewLine()
  1456. {
  1457. // update in scanner: previous line, current line, number of lines.
  1458. m_line++;
  1459. m_pchPrevLine = m_pchMinLine;
  1460. m_pchMinLine = m_currentCharacter;
  1461. m_cMinLineMultiUnits = m_cMultiUnits;
  1462. }
  1463. /*****************************************************************************
  1464. *
  1465. * Delivers a token stream.
  1466. */
  1467. template<typename EncodingPolicy>
  1468. tokens Scanner<EncodingPolicy>::ScanForcingPid()
  1469. {
  1470. if (m_DeferredParseFlags != ScanFlagNone)
  1471. {
  1472. BYTE deferredParseFlagsSave = m_DeferredParseFlags;
  1473. m_DeferredParseFlags = ScanFlagNone;
  1474. tokens result = tkEOF;
  1475. __try
  1476. {
  1477. result = Scan();
  1478. }
  1479. __finally
  1480. {
  1481. m_DeferredParseFlags = deferredParseFlagsSave;
  1482. }
  1483. return result;
  1484. }
  1485. return Scan();
  1486. }
  1487. template<typename EncodingPolicy>
  1488. tokens Scanner<EncodingPolicy>::Scan()
  1489. {
  1490. return ScanCore(true);
  1491. }
  1492. template<typename EncodingPolicy>
  1493. tokens Scanner<EncodingPolicy>::ScanNoKeywords()
  1494. {
  1495. return ScanCore(false);
  1496. }
  1497. template<typename EncodingPolicy>
  1498. tokens Scanner<EncodingPolicy>::ScanAhead()
  1499. {
  1500. return ScanNoKeywords();
  1501. }
  1502. template<typename EncodingPolicy>
  1503. tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
  1504. {
  1505. codepoint_t ch;
  1506. OLECHAR firstChar;
  1507. OLECHAR secondChar;
  1508. EncodedCharPtr pchT;
  1509. size_t multiUnits = 0;
  1510. EncodedCharPtr p = m_currentCharacter;
  1511. EncodedCharPtr last = m_pchLast;
  1512. // store the last token
  1513. m_tkPrevious = m_ptoken->tk;
  1514. m_iecpLimTokPrevious = IecpLimTok(); // Introduced for use by lambda parsing to find correct span of expression lambdas
  1515. if (p >= last)
  1516. {
  1517. m_pchMinTok = p;
  1518. m_cMinTokMultiUnits = m_cMultiUnits;
  1519. goto LEof;
  1520. }
  1521. tokens token;
  1522. m_fHadEol = FALSE;
  1523. CharTypes chType;
  1524. charcount_t commentStartLine;
  1525. bool seenDelimitedCommentEnd = false;
  1526. if (m_scanState && *p != 0)
  1527. {
  1528. if (m_fSyntaxColor)
  1529. {
  1530. firstChar = 0;
  1531. secondChar = 0;
  1532. m_pchMinTok = p;
  1533. m_cMinTokMultiUnits = m_cMultiUnits;
  1534. switch (m_scanState)
  1535. {
  1536. case ScanStateMultiLineComment:
  1537. goto LMultiLineComment;
  1538. case ScanStateMultiLineSingleQuoteString:
  1539. ch = '\'';
  1540. m_scanState = ScanStateNormal;
  1541. goto LScanStringConstant;
  1542. case ScanStateMultiLineDoubleQuoteString:
  1543. ch = '"';
  1544. m_scanState = ScanStateNormal;
  1545. goto LScanStringConstant;
  1546. }
  1547. }
  1548. if (m_scanState == ScanStateStringTemplateMiddleOrEnd)
  1549. {
  1550. AssertMsg(m_fStringTemplateDepth > 0,
  1551. "Shouldn't be trying to parse a string template end or middle token if we aren't scanning a string template");
  1552. m_scanState = ScanStateNormal;
  1553. pchT = p;
  1554. token = ScanStringTemplateMiddleOrEnd(&pchT);
  1555. p = pchT;
  1556. goto LDone;
  1557. }
  1558. }
  1559. for (;;)
  1560. {
  1561. LLoop:
  1562. m_pchMinTok = p;
  1563. m_cMinTokMultiUnits = m_cMultiUnits;
  1564. ch = ReadFirst(p, last);
  1565. #if DEBUG
  1566. chType = this->charClassifier->GetCharType((OLECHAR)ch);
  1567. #endif
  1568. switch (ch)
  1569. {
  1570. default:
  1571. if (ch == kchLS ||
  1572. ch == kchPS )
  1573. {
  1574. goto LNewLine;
  1575. }
  1576. {
  1577. BOOL isMultiUnit = IsMultiUnitChar((OLECHAR)ch);
  1578. if (isMultiUnit)
  1579. {
  1580. ch = ReadRest<true>((OLECHAR)ch, p, last);
  1581. }
  1582. if (es6UnicodeMode && Js::NumberUtilities::IsSurrogateLowerPart(ch))
  1583. {
  1584. codepoint_t upper = PeekFull(p, last);
  1585. if (Js::NumberUtilities::IsSurrogateUpperPart(upper))
  1586. {
  1587. ch = Js::NumberUtilities::SurrogatePairAsCodePoint(ch, upper);
  1588. ReadFull<true>(p, last);
  1589. }
  1590. }
  1591. if (this->charClassifier->IsIdStart(ch))
  1592. {
  1593. // We treat IDContinue as an error.
  1594. token = ScanIdentifierContinue(identifyKwds, false, !!isMultiUnit, m_pchMinTok, p, &p);
  1595. break;
  1596. }
  1597. }
  1598. chType = this->charClassifier->GetCharType(ch);
  1599. switch (chType)
  1600. {
  1601. case _C_WSP: continue;
  1602. case _C_NWL: goto LNewLine;
  1603. // All other types (except errors) are handled by the outer switch.
  1604. }
  1605. Assert(chType == _C_LET || chType == _C_ERR || chType == _C_UNK || chType == _C_BKQ || chType == _C_SHP || chType == _C_AT || chType == _C_DIG);
  1606. if (m_fSyntaxColor)
  1607. {
  1608. // No need to decrement the current position pointer as scanner will continue with scan next character onwards
  1609. return ScanError(p, tkID);
  1610. }
  1611. m_currentCharacter = p - 1;
  1612. Error(ERRillegalChar);
  1613. continue;
  1614. case '\0':
  1615. // Put back the null in case we get called again.
  1616. p--;
  1617. LEof:
  1618. token = tkEOF;
  1619. if (p + 1 < last)
  1620. {
  1621. if (m_fSyntaxColor)
  1622. {
  1623. return ScanError(p + 1, tkID);
  1624. }
  1625. // A \0 prior to the end of the text is an invalid character.
  1626. Error(ERRillegalChar);
  1627. }
  1628. break;
  1629. case 0x0009:
  1630. case 0x000B:
  1631. case 0x000C:
  1632. case 0x0020:
  1633. Assert(chType == _C_WSP);
  1634. continue;
  1635. case '.':
  1636. if (!Js::NumberUtilities::IsDigit(*p))
  1637. {
  1638. // Not a double
  1639. if (m_scriptContext->GetConfig()->IsES6SpreadEnabled() && PeekFirst(p, last) == '.' && PeekFirst(p + 1, last) == '.')
  1640. {
  1641. token = tkEllipsis;
  1642. p += 2;
  1643. }
  1644. else
  1645. {
  1646. token = tkDot;
  1647. }
  1648. break;
  1649. }
  1650. // May be a double, fall through
  1651. case '0': case '1': case '2': case '3': case '4':
  1652. case '5': case '6': case '7': case '8': case '9':
  1653. {
  1654. double dbl;
  1655. Assert(chType == _C_DIG || chType == _C_DOT);
  1656. p = m_pchMinTok;
  1657. RestoreMultiUnits(m_cMinTokMultiUnits);
  1658. bool likelyInt = true;
  1659. pchT = FScanNumber(p, &dbl, likelyInt);
  1660. if (p == pchT)
  1661. {
  1662. Assert(PeekFirst(p, last) != '.');
  1663. if (m_fSyntaxColor)
  1664. {
  1665. return ScanError(m_currentCharacter + 1, tkFltCon);
  1666. }
  1667. Error(ERRbadNumber);
  1668. }
  1669. Assert(!Js::NumberUtilities::IsNan(dbl));
  1670. p = pchT;
  1671. long value;
  1672. if (likelyInt && Js::NumberUtilities::FDblIsLong(dbl, &value))
  1673. {
  1674. m_ptoken->SetLong(value);
  1675. token = tkIntCon;
  1676. }
  1677. else
  1678. {
  1679. token = tkFltCon;
  1680. m_ptoken->SetDouble(dbl, likelyInt);
  1681. }
  1682. break;
  1683. }
  1684. case '(': Assert(chType == _C_LPR); token = tkLParen; break;
  1685. case ')': Assert(chType == _C_RPR); token = tkRParen; break;
  1686. case ',': Assert(chType == _C_CMA); token = tkComma; break;
  1687. case ';': Assert(chType == _C_SMC); token = tkSColon; break;
  1688. case '[': Assert(chType == _C_LBR); token = tkLBrack; break;
  1689. case ']': Assert(chType == _C_RBR); token = tkRBrack; break;
  1690. case '~': Assert(chType == _C_TIL); token = tkTilde; break;
  1691. case '?': Assert(chType == _C_QUE); token = tkQMark; break;
  1692. case '{': Assert(chType == _C_LC); token = tkLCurly; break;
  1693. // ES 2015 11.3 Line Terminators
  1694. case '\r':
  1695. case '\n':
  1696. // kchLS:
  1697. // kchPS:
  1698. LNewLine:
  1699. m_currentCharacter = p;
  1700. ScanNewLine(ch);
  1701. p = m_currentCharacter;
  1702. m_fHadEol = TRUE;
  1703. continue;
  1704. LReserved:
  1705. {
  1706. // We will derive the PID from the token
  1707. Assert(token < tkID);
  1708. m_ptoken->SetIdentifier(NULL);
  1709. goto LDone;
  1710. }
  1711. LEval:
  1712. {
  1713. token = tkID;
  1714. if (!this->m_parser) goto LIdentifier;
  1715. m_ptoken->SetIdentifier(this->m_parser->GetEvalPid());
  1716. goto LDone;
  1717. }
  1718. LArguments:
  1719. {
  1720. token = tkID;
  1721. if (!this->m_parser) goto LIdentifier;
  1722. m_ptoken->SetIdentifier(this->m_parser->GetArgumentsPid());
  1723. goto LDone;
  1724. }
  1725. LTarget:
  1726. {
  1727. token = tkID;
  1728. if (!this->m_parser) goto LIdentifier;
  1729. m_ptoken->SetIdentifier(this->m_parser->GetTargetPid());
  1730. goto LDone;
  1731. }
  1732. #include "kwd-swtch.h"
  1733. case 'A': case 'B': case 'C': case 'D': case 'E':
  1734. case 'F': case 'G': case 'H': case 'I': case 'J':
  1735. case 'K': case 'L': case 'M': case 'N': case 'O':
  1736. case 'P': case 'Q': case 'R': case 'S': case 'T':
  1737. case 'U': case 'V': case 'W': case 'X': case 'Y':
  1738. case 'Z':
  1739. // Lower-case letters handled in kwd-swtch.h above during reserved word recognition.
  1740. case '$': case '_':
  1741. LIdentifier:
  1742. Assert(this->charClassifier->IsIdStart(ch));
  1743. Assert(ch < 0x10000 && !IsMultiUnitChar((OLECHAR)ch));
  1744. token = ScanIdentifierContinue(identifyKwds, false, false, m_pchMinTok, p, &p);
  1745. break;
  1746. case '`':
  1747. Assert(chType == _C_BKQ);
  1748. pchT = p;
  1749. token = ScanStringTemplateBegin(&pchT);
  1750. p = pchT;
  1751. break;
  1752. case '}':
  1753. Assert(chType == _C_RC);
  1754. token = tkRCurly;
  1755. break;
  1756. case '\\':
  1757. pchT = p - 1;
  1758. token = ScanIdentifier(identifyKwds, &pchT);
  1759. if (tkScanError == token)
  1760. {
  1761. m_currentCharacter = p;
  1762. if (m_fSyntaxColor)
  1763. return ScanError(p, tkID);
  1764. Error(ERRillegalChar);
  1765. }
  1766. p = pchT;
  1767. break;
  1768. case ':':
  1769. token = tkColon;
  1770. break;
  1771. case '=':
  1772. token = tkAsg;
  1773. switch (PeekFirst(p, last))
  1774. {
  1775. case '=':
  1776. p++;
  1777. token = tkEQ;
  1778. if (PeekFirst(p, last) == '=')
  1779. {
  1780. p++;
  1781. token = tkEqv;
  1782. }
  1783. break;
  1784. case '>':
  1785. p++;
  1786. token = tkDArrow;
  1787. break;
  1788. }
  1789. break;
  1790. case '!':
  1791. token = tkBang;
  1792. if (PeekFirst(p, last) == '=')
  1793. {
  1794. p++;
  1795. token = tkNE;
  1796. if (PeekFirst(p, last) == '=')
  1797. {
  1798. p++;
  1799. token = tkNEqv;
  1800. }
  1801. }
  1802. break;
  1803. case '+':
  1804. token = tkAdd;
  1805. switch (PeekFirst(p, last))
  1806. {
  1807. case '=':
  1808. p++;
  1809. token = tkAsgAdd;
  1810. break;
  1811. case '+':
  1812. p++;
  1813. token = tkInc;
  1814. break;
  1815. }
  1816. break;
  1817. case '-':
  1818. token = tkSub;
  1819. switch (PeekFirst(p, last))
  1820. {
  1821. case '=':
  1822. p++;
  1823. token = tkAsgSub;
  1824. break;
  1825. case '-':
  1826. p++;
  1827. token = tkDec;
  1828. if (!m_fIsModuleCode)
  1829. {
  1830. if ('>' == PeekFirst(p, last) && (m_fHadEol || seenDelimitedCommentEnd)) // --> HTMLCloseComment
  1831. {
  1832. goto LSkipLineComment;
  1833. }
  1834. }
  1835. break;
  1836. }
  1837. break;
  1838. case '*':
  1839. token = tkStar;
  1840. switch(PeekFirst(p, last))
  1841. {
  1842. case '=' :
  1843. p++;
  1844. token = tkAsgMul;
  1845. break;
  1846. case '*' :
  1847. if (!m_scriptContext->GetConfig()->IsES7ExponentiationOperatorEnabled())
  1848. {
  1849. break;
  1850. }
  1851. p++;
  1852. token = tkExpo;
  1853. if (PeekFirst(p, last) == '=')
  1854. {
  1855. p++;
  1856. token = tkAsgExpo;
  1857. }
  1858. }
  1859. break;
  1860. case '/':
  1861. token = tkDiv;
  1862. switch(PeekFirst(p, last))
  1863. {
  1864. case '=':
  1865. p++;
  1866. token = tkAsgDiv;
  1867. break;
  1868. case '/':
  1869. if (p >= last)
  1870. {
  1871. AssertMsg(!m_fIsModuleCode, "Do we have other line comment cases scanning pass last?");
  1872. // Effective source length may have excluded HTMLCommentSuffix "//... -->". If we are scanning
  1873. // those, we have passed "last" already. Move back and return EOF.
  1874. p = last;
  1875. goto LEof;
  1876. }
  1877. ch = *++p;
  1878. firstChar = (OLECHAR)ch;
  1879. LSkipLineComment:
  1880. pchT = NULL;
  1881. for (;;)
  1882. {
  1883. switch ((ch = ReadFirst(p, last)))
  1884. {
  1885. case kchLS: // 0x2028, classifies as new line
  1886. case kchPS: // 0x2029, classifies as new line
  1887. LEcmaCommentLineBreak:
  1888. // kchPS and kchLS are more than one unit in UTF-8.
  1889. if (pchT)
  1890. {
  1891. // kchPS and kchLS are more than one unit in UTF-8.
  1892. p = pchT;
  1893. }
  1894. else
  1895. {
  1896. // But only a single code unit in UTF16
  1897. p--;
  1898. }
  1899. RestoreMultiUnits(multiUnits);
  1900. goto LCommentLineBreak;
  1901. case kchNWL:
  1902. case kchRET:
  1903. p--;
  1904. LCommentLineBreak:
  1905. if (m_fSyntaxColor)
  1906. {
  1907. token = tkComment;
  1908. goto LDone;
  1909. }
  1910. // Subtract the comment length from the total char count for the purpose
  1911. // of deciding whether to defer AST and byte code generation.
  1912. m_parser->ReduceDeferredScriptLength((ULONG)(p - m_pchMinTok));
  1913. break;
  1914. case kchNUL:
  1915. if (p >= last)
  1916. {
  1917. p--;
  1918. goto LCommentLineBreak;
  1919. }
  1920. continue;
  1921. default:
  1922. if (IsMultiUnitChar((OLECHAR)ch))
  1923. {
  1924. pchT = p - 1;
  1925. multiUnits = m_cMultiUnits;
  1926. switch (ch = ReadRest<true>((OLECHAR)ch, p, last))
  1927. {
  1928. case kchLS:
  1929. case kchPS:
  1930. goto LEcmaCommentLineBreak;
  1931. }
  1932. }
  1933. continue;
  1934. }
  1935. break;
  1936. }
  1937. continue;
  1938. case '*':
  1939. ch = *++p;
  1940. firstChar = (OLECHAR)ch;
  1941. if ((p + 1) < last)
  1942. {
  1943. secondChar = (OLECHAR)(*(p + 1));
  1944. }
  1945. else
  1946. {
  1947. secondChar = '\0';
  1948. }
  1949. LMultiLineComment:
  1950. pchT = p;
  1951. commentStartLine = m_line;
  1952. bool containTypeDef;
  1953. if (tkNone == (token = SkipComment(&pchT, &containTypeDef)))
  1954. {
  1955. // Subtract the comment length from the total char count for the purpose
  1956. // of deciding whether to defer AST and byte code generation.
  1957. m_parser->ReduceDeferredScriptLength((ULONG)(pchT - m_pchMinTok));
  1958. p = pchT;
  1959. seenDelimitedCommentEnd = true;
  1960. goto LLoop;
  1961. }
  1962. p = pchT;
  1963. break;
  1964. }
  1965. break;
  1966. case '%':
  1967. Assert(chType == _C_PCT);
  1968. token = tkPct;
  1969. if (PeekFirst(p, last) == '=')
  1970. {
  1971. p++;
  1972. token = tkAsgMod;
  1973. }
  1974. break;
  1975. case '<':
  1976. Assert(chType == _C_LT);
  1977. token = tkLT;
  1978. switch (PeekFirst(p, last))
  1979. {
  1980. case '=':
  1981. p++;
  1982. token = tkLE;
  1983. break;
  1984. case '<':
  1985. p++;
  1986. token = tkLsh;
  1987. if (PeekFirst(p, last) == '=')
  1988. {
  1989. p++;
  1990. token = tkAsgLsh;
  1991. break;
  1992. }
  1993. break;
  1994. case '!':
  1995. // ES 2015 B.1.3 - HTML comments are only allowed when parsing non-module code.
  1996. if (!m_fIsModuleCode && PeekFirst(p + 1, last) == '-' && PeekFirst(p + 2, last) == '-')
  1997. {
  1998. // This is a "<!--" comment - treat as //
  1999. if (p >= last)
  2000. {
  2001. // Effective source length may have excluded HTMLCommentSuffix "<!-- ... -->". If we are scanning
  2002. // those, we have passed "last" already. Move back and return EOF.
  2003. p = last;
  2004. goto LEof;
  2005. }
  2006. firstChar = '!';
  2007. goto LSkipLineComment;
  2008. }
  2009. break;
  2010. }
  2011. break;
  2012. case '>':
  2013. Assert(chType == _C_GT);
  2014. token = tkGT;
  2015. switch (PeekFirst(p, last))
  2016. {
  2017. case '=':
  2018. p++;
  2019. token = tkGE;
  2020. break;
  2021. case '>':
  2022. p++;
  2023. token = tkRsh;
  2024. switch (PeekFirst(p, last))
  2025. {
  2026. case '=':
  2027. p++;
  2028. token = tkAsgRsh;
  2029. break;
  2030. case '>':
  2031. p++;
  2032. token = tkRs2;
  2033. if (*p == '=')
  2034. {
  2035. p++;
  2036. token = tkAsgRs2;
  2037. }
  2038. break;
  2039. }
  2040. break;
  2041. }
  2042. break;
  2043. case '^':
  2044. Assert(chType == _C_XOR);
  2045. token = tkXor;
  2046. if (PeekFirst(p, last) == '=')
  2047. {
  2048. p++;
  2049. token = tkAsgXor;
  2050. }
  2051. break;
  2052. case '|':
  2053. Assert(chType == _C_BAR);
  2054. token = tkOr;
  2055. switch (PeekFirst(p, last))
  2056. {
  2057. case '=':
  2058. p++;
  2059. token = tkAsgOr;
  2060. break;
  2061. case '|':
  2062. p++;
  2063. token = tkLogOr;
  2064. break;
  2065. }
  2066. break;
  2067. case '&':
  2068. Assert(chType == _C_AMP);
  2069. token = tkAnd;
  2070. switch (PeekFirst(p, last))
  2071. {
  2072. case '=':
  2073. p++;
  2074. token = tkAsgAnd;
  2075. break;
  2076. case '&':
  2077. p++;
  2078. token = tkLogAnd;
  2079. break;
  2080. }
  2081. break;
  2082. case '\'':
  2083. case '"':
  2084. Assert(chType == _C_QUO || chType == _C_APO);
  2085. LScanStringConstant:
  2086. pchT = p;
  2087. token = ScanStringConstant((OLECHAR)ch, &pchT);
  2088. p = pchT;
  2089. break;
  2090. }
  2091. break;
  2092. }
  2093. LDone:
  2094. m_currentCharacter = p;
  2095. return (m_ptoken->tk = token);
  2096. }
  2097. template <typename EncodingPolicy>
  2098. IdentPtr Scanner<EncodingPolicy>::GetSecondaryBufferAsPid()
  2099. {
  2100. bool createPid = true;
  2101. if (m_fSyntaxColor || (m_DeferredParseFlags & ScanFlagSuppressStrPid) != 0)
  2102. {
  2103. createPid = false;
  2104. }
  2105. if (createPid)
  2106. {
  2107. return m_phtbl->PidHashNameLen(m_tempChBufSecondary.m_prgch, m_tempChBufSecondary.m_ichCur);
  2108. }
  2109. else
  2110. {
  2111. return nullptr;
  2112. }
  2113. }
  2114. template <typename EncodingPolicy>
  2115. LPCOLESTR Scanner<EncodingPolicy>::StringFromLong(long lw)
  2116. {
  2117. _ltow_s(lw, m_tempChBuf.m_prgch, m_tempChBuf.m_cchMax, 10);
  2118. return m_tempChBuf.m_prgch;
  2119. }
  2120. template <typename EncodingPolicy>
  2121. IdentPtr Scanner<EncodingPolicy>::PidFromLong(long lw)
  2122. {
  2123. return m_phtbl->PidHashName(StringFromLong(lw));
  2124. }
  2125. template <typename EncodingPolicy>
  2126. LPCOLESTR Scanner<EncodingPolicy>::StringFromDbl(double dbl)
  2127. {
  2128. if (!Js::NumberUtilities::FDblToStr(dbl, m_tempChBuf.m_prgch, m_tempChBuf.m_cchMax))
  2129. {
  2130. Error(ERRnoMemory);
  2131. }
  2132. return m_tempChBuf.m_prgch;
  2133. }
  2134. template <typename EncodingPolicy>
  2135. IdentPtr Scanner<EncodingPolicy>::PidFromDbl(double dbl)
  2136. {
  2137. return m_phtbl->PidHashName(StringFromDbl(dbl));
  2138. }
  2139. template <typename EncodingPolicy>
  2140. void Scanner<EncodingPolicy>::Capture(_Out_ RestorePoint* restorePoint)
  2141. {
  2142. Capture(restorePoint, 0, 0);
  2143. }
  2144. template <typename EncodingPolicy>
  2145. void Scanner<EncodingPolicy>::Capture(_Out_ RestorePoint* restorePoint, uint functionIdIncrement, size_t lengthDecr)
  2146. {
  2147. restorePoint->m_ichMinTok = this->IchMinTok();
  2148. restorePoint->m_ichMinLine = this->IchMinLine();
  2149. restorePoint->m_cMinTokMultiUnits = this->m_cMinTokMultiUnits;
  2150. restorePoint->m_cMinLineMultiUnits = this->m_cMinLineMultiUnits;
  2151. restorePoint->m_line = this->m_line;
  2152. restorePoint->m_fHadEol = this->m_fHadEol;
  2153. restorePoint->functionIdIncrement = functionIdIncrement;
  2154. restorePoint->lengthDecr = lengthDecr;
  2155. #ifdef DEBUG
  2156. restorePoint->m_cMultiUnits = this->m_cMultiUnits;
  2157. #endif
  2158. }
  2159. template <typename EncodingPolicy>
  2160. void Scanner<EncodingPolicy>::SeekTo(const RestorePoint& restorePoint)
  2161. {
  2162. SeekAndScan<false>(restorePoint);
  2163. }
  2164. template <typename EncodingPolicy>
  2165. void Scanner<EncodingPolicy>::SeekToForcingPid(const RestorePoint& restorePoint)
  2166. {
  2167. SeekAndScan<true>(restorePoint);
  2168. }
  2169. template <typename EncodingPolicy>
  2170. template <bool forcePid>
  2171. void Scanner<EncodingPolicy>::SeekAndScan(const RestorePoint& restorePoint)
  2172. {
  2173. this->m_currentCharacter = this->m_pchBase + restorePoint.m_ichMinTok + restorePoint.m_cMinTokMultiUnits;
  2174. this->m_pchMinLine = this->m_pchBase + restorePoint.m_ichMinLine + restorePoint.m_cMinLineMultiUnits;
  2175. this->m_cMinLineMultiUnits = restorePoint.m_cMinLineMultiUnits;
  2176. this->RestoreMultiUnits(restorePoint.m_cMinTokMultiUnits);
  2177. if (forcePid)
  2178. {
  2179. this->ScanForcingPid();
  2180. }
  2181. else
  2182. {
  2183. this->Scan();
  2184. }
  2185. this->m_line = restorePoint.m_line;
  2186. this->m_fHadEol = restorePoint.m_fHadEol;
  2187. this->m_parser->ReduceDeferredScriptLength(restorePoint.lengthDecr);
  2188. Assert(this->m_cMultiUnits == restorePoint.m_cMultiUnits);
  2189. }
  2190. template <typename EncodingPolicy>
  2191. void Scanner<EncodingPolicy>::SeekTo(const RestorePoint& restorePoint, uint *nextFunctionId)
  2192. {
  2193. SeekTo(restorePoint);
  2194. *nextFunctionId += restorePoint.functionIdIncrement;
  2195. }
  2196. // Called by CompileScriptException::ProcessError to retrieve a BSTR for the line on which an error occurred.
  2197. template<typename EncodingPolicy>
  2198. HRESULT Scanner<EncodingPolicy>::SysAllocErrorLine(long ichMinLine, __out BSTR* pbstrLine)
  2199. {
  2200. if( !pbstrLine )
  2201. {
  2202. return E_POINTER;
  2203. }
  2204. // If we overflow the string, we have a serious problem...
  2205. if (ichMinLine < 0 || static_cast<size_t>(ichMinLine) > AdjustedLength() )
  2206. {
  2207. return E_UNEXPECTED;
  2208. }
  2209. EncodedCharPtr pStart = static_cast<size_t>(ichMinLine) == IchMinLine() ? m_pchMinLine : m_pchBase + CharacterOffsetToUnitOffset(m_pchBase, m_currentCharacter, m_pchLast, ichMinLine);
  2210. EncodedCharPtr pEnd = AdjustedLast();
  2211. // Determine the length by scanning for the next newline
  2212. charcount_t cch = LineLength(pStart, pEnd);
  2213. Assert(cch <= LONG_MAX);
  2214. *pbstrLine = SysAllocStringLen(NULL, cch);
  2215. if (!*pbstrLine)
  2216. {
  2217. return E_OUTOFMEMORY;
  2218. }
  2219. ConvertToUnicode(*pbstrLine, cch, pStart);
  2220. return S_OK;
  2221. }
  2222. template class Scanner<NullTerminatedUnicodeEncodingPolicy>;
  2223. template class Scanner<NullTerminatedUTF8EncodingPolicy>;
  2224. template class Scanner<NotNullTerminatedUTF8EncodingPolicy>;