Scan.cpp 77 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #include "ParserPch.h"
  6. /*****************************************************************************
  7. *
  8. * The following table speeds various tests of characters, such as whether
  9. * a given character can be part of an identifier, and so on.
  10. */
  11. int CountNewlines(LPCOLESTR psz, int cch)
  12. {
  13. int cln = 0;
  14. while (0 != *psz && 0 != cch--)
  15. {
  16. switch (*psz++)
  17. {
  18. case _u('\xD'):
  19. if (*psz == _u('\xA'))
  20. {
  21. ++psz;
  22. if (0 == cch--)
  23. break;
  24. }
  25. // fall-through
  26. case _u('\xA'):
  27. cln++;
  28. break;
  29. }
  30. }
  31. return cln;
  32. }
  // Primary template: deliberately empty. Members are supplied only by the
  // explicit specializations of AorW for concrete character types.
  template <typename CharT>
  struct AorW
  {
  };
  37. // Specialization for UTF8Char
  38. template<>
  39. struct AorW< UTF8Char >
  40. {
  41. // Expressing the args as "arrays of size N" ensures that the both args
  42. // are the same length. If not, we get a compile time error.
  43. template< size_t N >
  44. static const UTF8Char* Choose( const char (&a)[N], const char16 (&w)[N] )
  45. {
  46. // The reinterpret_cast is necessary to go from signed to unsigned char
  47. return reinterpret_cast< const UTF8Char* >(a);
  48. }
  49. template< size_t N >
  50. static const bool Test(const char (&a)[N], const char16 (&w)[N], LPCUTF8 value)
  51. {
  52. return 0 == memcmp(a, value, (N - 1) * sizeof(utf8char_t));
  53. }
  54. template< size_t N >
  55. static const bool Test(const char (&a)[N], const char16 (&w)[N], LPCUTF8 start, LPCUTF8 end)
  56. {
  57. return (end - start == N - 1) && (0 == memcmp(a, start, (N - 1) * sizeof(utf8char_t)));
  58. }
  59. };
  60. // Specialization for OLECHAR
  61. template<>
  62. struct AorW< OLECHAR >
  63. {
  64. template< size_t N >
  65. static const char16* Choose( const char (&a)[N], const char16 (&w)[N] )
  66. {
  67. return w;
  68. }
  69. template < size_t N >
  70. static bool Test(const char (&a)[N], const char16 (&w)[N], const char16 *value)
  71. {
  72. return 0 == memcmp(w, value, (N - 1) * sizeof(char16));
  73. }
  74. template < size_t N >
  75. static bool Test(const char (&a)[N], const char16 (&w)[N], const char16 *start, const char16 *end)
  76. {
  77. return (end - start == N - 1) && (0 == memcmp(w, start, (N - 1) * sizeof(char16)));
  78. }
  79. };
  80. BOOL Token::IsKeyword() const
  81. {
  82. // keywords (but not future reserved words)
  83. return (tk <= tkYIELD);
  84. }
  85. tokens Token::SetRegex(UnifiedRegex::RegexPattern *const pattern, Parser *const parser)
  86. {
  87. Assert(parser);
  88. if(pattern)
  89. parser->RegisterRegexPattern(pattern);
  90. this->u.pattern = pattern;
  91. return tk = tkRegExp;
  92. }
  93. IdentPtr Token::CreateIdentifier(HashTbl * hashTbl)
  94. {
  95. Assert(this->u.pid == nullptr);
  96. if (this->u.pchMin)
  97. {
  98. Assert(IsIdentifier());
  99. IdentPtr pid = hashTbl->PidHashNameLen(this->u.pchMin, this->u.pchMin + this->u.length, this->u.length);
  100. this->u.pid = pid;
  101. return pid;
  102. }
  103. Assert(IsReservedWord());
  104. IdentPtr pid = hashTbl->PidFromTk(tk);
  105. this->u.pid = pid;
  106. return pid;
  107. }
  108. template <typename EncodingPolicy>
  109. Scanner<EncodingPolicy>::Scanner(Parser* parser, HashTbl *phtbl, Token *ptoken, ErrHandler *perr, Js::ScriptContext* scriptContext)
  110. {
  111. AssertMem(phtbl);
  112. AssertMem(ptoken);
  113. AssertMem(perr);
  114. m_parser = parser;
  115. m_phtbl = phtbl;
  116. m_ptoken = ptoken;
  117. m_cMinLineMultiUnits = 0;
  118. m_perr = perr;
  119. m_fHadEol = FALSE;
  120. m_doubleQuoteOnLastTkStrCon = FALSE;
  121. m_OctOrLeadingZeroOnLastTKNumber = false;
  122. m_fStringTemplateDepth = 0;
  123. m_scanState = ScanStateNormal;
  124. m_scriptContext = scriptContext;
  125. m_line = 0;
  126. m_startLine = 0;
  127. m_pchStartLine = NULL;
  128. m_ichMinError = 0;
  129. m_ichLimError = 0;
  130. m_tempChBuf.m_pscanner = this;
  131. m_tempChBufSecondary.m_pscanner = this;
  132. m_iecpLimTokPrevious = (size_t)-1;
  133. this->charClassifier = scriptContext->GetCharClassifier();
  134. this->es6UnicodeMode = scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled();
  135. m_fYieldIsKeyword = false;
  136. m_fAwaitIsKeyword = false;
  137. }
  138. template <typename EncodingPolicy>
  139. Scanner<EncodingPolicy>::~Scanner(void)
  140. {
  141. }
  142. /*****************************************************************************
  143. *
  144. * Initializes the scanner to prepare to scan the given source text.
  145. */
  146. template <typename EncodingPolicy>
  147. void Scanner<EncodingPolicy>::SetText(EncodedCharPtr pszSrc, size_t offset, size_t length, charcount_t charOffset, ULONG grfscr, ULONG lineNumber)
  148. {
  149. // Save the start of the script and add the offset to get the point where we should start scanning.
  150. m_pchBase = pszSrc;
  151. m_pchLast = m_pchBase + offset + length;
  152. m_pchPrevLine = m_currentCharacter = m_pchMinLine = m_pchMinTok = pszSrc + offset;
  153. this->RestoreMultiUnits(offset - charOffset);
  154. // Absorb any byte order mark at the start
  155. if(offset == 0)
  156. {
  157. switch( this->PeekFull(m_currentCharacter, m_pchLast) )
  158. {
  159. case 0xFFEE: // "Opposite" endian BOM
  160. // We do not support big-endian encodings
  161. // fall-through
  162. case 0xFEFF: // "Correct" BOM
  163. this->template ReadFull<true>(m_currentCharacter, m_pchLast);
  164. break;
  165. }
  166. }
  167. m_line = lineNumber;
  168. m_startLine = lineNumber;
  169. m_pchStartLine = m_currentCharacter;
  170. m_ptoken->tk = tkNone;
  171. m_fIsModuleCode = (grfscr & fscrIsModuleCode) != 0;
  172. m_fHadEol = FALSE;
  173. m_fSyntaxColor = (grfscr & fscrSyntaxColor) != 0;
  174. m_DeferredParseFlags = ScanFlagNone;
  175. }
  176. template <typename EncodingPolicy>
  177. void Scanner<EncodingPolicy>::PrepareForBackgroundParse(Js::ScriptContext *scriptContext)
  178. {
  179. scriptContext->GetThreadContext()->GetStandardChars((EncodedChar*)0);
  180. scriptContext->GetThreadContext()->GetStandardChars((char16*)0);
  181. }
  182. //-----------------------------------------------------------------------------
  183. // Number of code points from 'first' up to, but not including the next
  184. // newline character, embedded NUL, or 'last', depending on which comes first.
  185. //
  186. // This is used to determine a length of BSTR, which can't contain a NUL character.
  187. //-----------------------------------------------------------------------------
  188. template <typename EncodingPolicy>
  189. charcount_t Scanner<EncodingPolicy>::LineLength(EncodedCharPtr first, EncodedCharPtr last)
  190. {
  191. charcount_t result = 0;
  192. EncodedCharPtr p = first;
  193. for (;;)
  194. {
  195. switch( this->template ReadFull<false>(p, last) )
  196. {
  197. case kchNWL: // _C_NWL
  198. case kchRET:
  199. case kchLS:
  200. case kchPS:
  201. case kchNUL: // _C_NUL
  202. return result;
  203. }
  204. result++;
  205. }
  206. }
  207. template <typename EncodingPolicy>
  208. charcount_t Scanner<EncodingPolicy>::UpdateLine(int32 &line, EncodedCharPtr start, EncodedCharPtr last, charcount_t ichStart, charcount_t ichEnd)
  209. {
  210. EncodedCharPtr p = start;
  211. charcount_t ich = ichStart;
  212. int32 current = line;
  213. charcount_t lastStart = ichStart;
  214. while (ich < ichEnd)
  215. {
  216. ich++;
  217. switch (this->template ReadFull<false>(p, last))
  218. {
  219. case kchRET:
  220. if (this->PeekFull(p, last) == kchNWL)
  221. {
  222. ich++;
  223. this->template ReadFull<false>(p, last);
  224. }
  225. // fall-through
  226. case kchNWL:
  227. case kchLS:
  228. case kchPS:
  229. current++;
  230. lastStart = ich;
  231. break;
  232. case kchNUL:
  233. goto done;
  234. }
  235. }
  236. done:
  237. line = current;
  238. return lastStart;
  239. }
  240. template <typename EncodingPolicy>
  241. bool Scanner<EncodingPolicy>::TryReadEscape(EncodedCharPtr& startingLocation, EncodedCharPtr endOfSource, codepoint_t *outChar)
  242. {
  243. Assert(outChar != nullptr);
  244. Assert(startingLocation <= endOfSource);
  245. EncodedCharPtr currentLocation = startingLocation;
  246. codepoint_t charToOutput = 0x0;
  247. // '\' is Assumed as there is only one caller
  248. // Read 'u' characters
  249. if (currentLocation >= endOfSource || this->ReadFirst(currentLocation, endOfSource) != 'u')
  250. {
  251. return false;
  252. }
  253. bool expectCurly = false;
  254. if (currentLocation < endOfSource && this->PeekFirst(currentLocation, endOfSource) == '{' && es6UnicodeMode)
  255. {
  256. expectCurly = true;
  257. // Move past the character
  258. this->ReadFirst(currentLocation, endOfSource);
  259. }
  260. uint i = 0;
  261. OLECHAR ch = 0;
  262. int hexValue = 0;
  263. uint maxHexDigits = (expectCurly ? MAXUINT32 : 4u);
  264. for(; i < maxHexDigits && currentLocation < endOfSource; i++)
  265. {
  266. if (!Js::NumberUtilities::FHexDigit(ch = this->ReadFirst(currentLocation, endOfSource), &hexValue))
  267. {
  268. break;
  269. }
  270. charToOutput = charToOutput * 0x10 + hexValue;
  271. if (charToOutput > 0x10FFFF)
  272. {
  273. return false;
  274. }
  275. }
  276. //At least 4 characters have to be read
  277. if (i == 0 || (i != 4 && !expectCurly))
  278. {
  279. return false;
  280. }
  281. Assert(expectCurly ? es6UnicodeMode : true);
  282. if (expectCurly && ch != '}')
  283. {
  284. return false;
  285. }
  286. *outChar = charToOutput;
  287. startingLocation = currentLocation;
  288. return true;
  289. }
  290. template <typename EncodingPolicy>
  291. template <bool bScan>
  292. bool Scanner<EncodingPolicy>::TryReadCodePointRest(codepoint_t lower, EncodedCharPtr& startingLocation, EncodedCharPtr endOfSource, codepoint_t *outChar, bool *outContainsMultiUnitChar)
  293. {
  294. Assert(outChar != nullptr);
  295. Assert(outContainsMultiUnitChar != nullptr);
  296. Assert(es6UnicodeMode);
  297. Assert(Js::NumberUtilities::IsSurrogateLowerPart(lower));
  298. EncodedCharPtr currentLocation = startingLocation;
  299. *outChar = lower;
  300. if (currentLocation < endOfSource)
  301. {
  302. size_t restorePoint = this->m_cMultiUnits;
  303. codepoint_t upper = this->template ReadFull<bScan>(currentLocation, endOfSource);
  304. if (Js::NumberUtilities::IsSurrogateUpperPart(upper))
  305. {
  306. *outChar = Js::NumberUtilities::SurrogatePairAsCodePoint(lower, upper);
  307. if (this->IsMultiUnitChar(static_cast<OLECHAR>(upper)))
  308. {
  309. *outContainsMultiUnitChar = true;
  310. }
  311. startingLocation = currentLocation;
  312. }
  313. else
  314. {
  315. this->RestoreMultiUnits(restorePoint);
  316. }
  317. }
  318. return true;
  319. }
  320. template <typename EncodingPolicy>
  321. template <bool bScan>
  322. inline bool Scanner<EncodingPolicy>::TryReadCodePoint(EncodedCharPtr &startingLocation, EncodedCharPtr endOfSource, codepoint_t *outChar, bool *hasEscape, bool *outContainsMultiUnitChar)
  323. {
  324. Assert(outChar != nullptr);
  325. Assert(outContainsMultiUnitChar != nullptr);
  326. if (startingLocation >= endOfSource)
  327. {
  328. return false;
  329. }
  330. codepoint_t ch = this->template ReadFull<bScan>(startingLocation, endOfSource);
  331. if (FBigChar(ch))
  332. {
  333. if (this->IsMultiUnitChar(static_cast<OLECHAR>(ch)))
  334. {
  335. *outContainsMultiUnitChar = true;
  336. }
  337. if (es6UnicodeMode && Js::NumberUtilities::IsSurrogateLowerPart(ch))
  338. {
  339. return TryReadCodePointRest<bScan>(ch, startingLocation, endOfSource, outChar, outContainsMultiUnitChar);
  340. }
  341. }
  342. else if (ch == '\\' && TryReadEscape(startingLocation, endOfSource, &ch))
  343. {
  344. *hasEscape = true;
  345. }
  346. *outChar = ch;
  347. return true;
  348. }
  349. template <typename EncodingPolicy>
  350. tokens Scanner<EncodingPolicy>::ScanIdentifier(bool identifyKwds, EncodedCharPtr *pp)
  351. {
  352. EncodedCharPtr p = *pp;
  353. EncodedCharPtr pchMin = p;
  354. // JS6 allows unicode characters in the form of \uxxxx escape sequences
  355. // to be part of the identifier.
  356. bool fHasEscape = false;
  357. bool fHasMultiChar = false;
  358. codepoint_t codePoint = INVALID_CODEPOINT;
  359. size_t multiUnitsBeforeLast = this->m_cMultiUnits;
  360. // Check if we started the id
  361. if (!TryReadCodePoint<true>(p, m_pchLast, &codePoint, &fHasEscape, &fHasMultiChar))
  362. {
  363. // If no chars. could be scanned as part of the identifier, return error.
  364. return tkScanError;
  365. }
  366. Assert(codePoint < 0x110000u);
  367. if (!charClassifier->IsIdStart(codePoint))
  368. {
  369. // Put back the last character
  370. this->RestoreMultiUnits(multiUnitsBeforeLast);
  371. // If no chars. could be scanned as part of the identifier, return error.
  372. return tkScanError;
  373. }
  374. return ScanIdentifierContinue(identifyKwds, fHasEscape, fHasMultiChar, pchMin, p, pp);
  375. }
  376. template <typename EncodingPolicy>
  377. BOOL Scanner<EncodingPolicy>::FastIdentifierContinue(EncodedCharPtr&p, EncodedCharPtr last)
  378. {
  379. if (EncodingPolicy::MultiUnitEncoding)
  380. {
  381. while (p < last)
  382. {
  383. EncodedChar currentChar = *p;
  384. if (this->IsMultiUnitChar(currentChar))
  385. {
  386. // multi unit character, we may not have reach the end yet
  387. return FALSE;
  388. }
  389. Assert(currentChar != '\\' || !charClassifier->IsIdContinueFast<false>(currentChar));
  390. if (!charClassifier->IsIdContinueFast<false>(currentChar))
  391. {
  392. // only reach the end of the identifier if it is not the start of an escape sequence
  393. return currentChar != '\\';
  394. }
  395. p++;
  396. }
  397. // We have reach the end of the identifier.
  398. return TRUE;
  399. }
  400. // Not fast path for non multi unit encoding
  401. return false;
  402. }
  403. template <typename EncodingPolicy>
  404. tokens Scanner<EncodingPolicy>::ScanIdentifierContinue(bool identifyKwds, bool fHasEscape, bool fHasMultiChar,
  405. EncodedCharPtr pchMin, EncodedCharPtr p, EncodedCharPtr *pp)
  406. {
  407. EncodedCharPtr last = m_pchLast;
  408. while (true)
  409. {
  410. // Fast path for utf8, non-multi unit char and not escape
  411. if (FastIdentifierContinue(p, last))
  412. {
  413. break;
  414. }
  415. // Slow path that has to deal with multi unit encoding
  416. codepoint_t codePoint = INVALID_CODEPOINT;
  417. EncodedCharPtr pchBeforeLast = p;
  418. size_t multiUnitsBeforeLast = this->m_cMultiUnits;
  419. if (TryReadCodePoint<true>(p, last, &codePoint, &fHasEscape, &fHasMultiChar))
  420. {
  421. Assert(codePoint < 0x110000u);
  422. if (charClassifier->IsIdContinue(codePoint))
  423. {
  424. continue;
  425. }
  426. }
  427. // Put back the last character
  428. p = pchBeforeLast;
  429. this->RestoreMultiUnits(multiUnitsBeforeLast);
  430. break;
  431. }
  432. Assert(p - pchMin > 0 && p - pchMin <= LONG_MAX);
  433. *pp = p;
  434. if (!identifyKwds)
  435. {
  436. return tkID;
  437. }
  438. // During syntax coloring, scanner doesn't need to convert the escape sequence to get actual characters, it just needs the classification information
  439. // So call up hashtables custom method to check if the string scanned is identifier or keyword.
  440. // Do the same for deferred parsing, but use a custom method that only tokenizes JS keywords.
  441. if ((m_DeferredParseFlags & ScanFlagSuppressIdPid) != 0)
  442. {
  443. m_ptoken->SetIdentifier(NULL);
  444. if (!fHasEscape)
  445. {
  446. // If there are no escape, that the main scan loop would have found the keyword already
  447. // So we can just assume it is an ID
  448. DebugOnly(int32 cch = UnescapeToTempBuf(pchMin, p));
  449. DebugOnly(tokens tk = m_phtbl->TkFromNameLen(m_tempChBuf.m_prgch, cch, IsStrictMode()));
  450. Assert(tk == tkID || (tk == tkYIELD && !m_fYieldIsKeyword) || (tk == tkAWAIT && !m_fAwaitIsKeyword));
  451. return tkID;
  452. }
  453. int32 cch = UnescapeToTempBuf(pchMin, p);
  454. tokens tk = m_phtbl->TkFromNameLen(m_tempChBuf.m_prgch, cch, IsStrictMode());
  455. return (!m_fYieldIsKeyword && tk == tkYIELD) || (!m_fAwaitIsKeyword && tk == tkAWAIT) ? tkID : tk;
  456. }
  457. else if (m_fSyntaxColor)
  458. {
  459. m_ptoken->SetIdentifier(NULL);
  460. // We always need to check TkFromNameLenColor because
  461. // the main Scan switch doesn't detect all non-keyword that needs coloring
  462. // (e.g. int)
  463. int32 cch = UnescapeToTempBuf(pchMin, p);
  464. return m_phtbl->TkFromNameLenColor(m_tempChBuf.m_prgch, cch);
  465. }
  466. // UTF16 Scanner are only for syntax coloring, so it shouldn't come here.
  467. if (EncodingPolicy::MultiUnitEncoding && !fHasMultiChar && !fHasEscape)
  468. {
  469. Assert(sizeof(EncodedChar) == 1);
  470. // If there are no escape, that the main scan loop would have found the keyword already
  471. // So we can just assume it is an ID
  472. DebugOnly(int32 cch = UnescapeToTempBuf(pchMin, p));
  473. DebugOnly(tokens tk = m_phtbl->TkFromNameLen(m_tempChBuf.m_prgch, cch, IsStrictMode()));
  474. Assert(tk == tkID || (tk == tkYIELD && !m_fYieldIsKeyword) || (tk == tkAWAIT && !m_fAwaitIsKeyword));
  475. m_ptoken->SetIdentifier(reinterpret_cast<const char *>(pchMin), (int32)(p - pchMin));
  476. return tkID;
  477. }
  478. IdentPtr pid = PidOfIdentiferAt(pchMin, p, fHasEscape, fHasMultiChar);
  479. m_ptoken->SetIdentifier(pid);
  480. if (!fHasEscape)
  481. {
  482. // If it doesn't have escape, then Scan() should have taken care of keywords (except
  483. // yield if m_fYieldIsKeyword is false, in which case yield is treated as an identifier, and except
  484. // await if m_fAwaitIsKeyword is false, in which case await is treated as an identifier).
  485. // We don't have to check if the name is reserved word and return it as an Identifier
  486. Assert(pid->Tk(IsStrictMode()) == tkID
  487. || (pid->Tk(IsStrictMode()) == tkYIELD && !m_fYieldIsKeyword)
  488. || (pid->Tk(IsStrictMode()) == tkAWAIT && !m_fAwaitIsKeyword));
  489. return tkID;
  490. }
  491. tokens tk = pid->Tk(IsStrictMode());
  492. return tk == tkID || (tk == tkYIELD && !m_fYieldIsKeyword) || (tk == tkAWAIT && !m_fAwaitIsKeyword) ? tkID : tkNone;
  493. }
  494. template <typename EncodingPolicy>
  495. IdentPtr Scanner<EncodingPolicy>::PidAt(size_t iecpMin, size_t iecpLim)
  496. {
  497. Assert(iecpMin < AdjustedLength() && iecpLim <= AdjustedLength() && iecpLim > iecpMin);
  498. return PidOfIdentiferAt(m_pchBase + iecpMin, m_pchBase + iecpLim);
  499. }
  500. template <typename EncodingPolicy>
  501. uint32 Scanner<EncodingPolicy>::UnescapeToTempBuf(EncodedCharPtr p, EncodedCharPtr last)
  502. {
  503. m_tempChBuf.Init();
  504. while( p < last )
  505. {
  506. codepoint_t codePoint;
  507. bool hasEscape, isMultiChar;
  508. bool gotCodePoint = TryReadCodePoint<false>(p, last, &codePoint, &hasEscape, &isMultiChar);
  509. Assert(gotCodePoint);
  510. Assert(codePoint < 0x110000);
  511. if (codePoint < 0x10000)
  512. {
  513. m_tempChBuf.AppendCh((OLECHAR)codePoint);
  514. }
  515. else
  516. {
  517. char16 lower, upper;
  518. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &upper);
  519. m_tempChBuf.AppendCh(lower);
  520. m_tempChBuf.AppendCh(upper);
  521. }
  522. }
  523. return m_tempChBuf.m_ichCur;
  524. }
  525. template <typename EncodingPolicy>
  526. IdentPtr Scanner<EncodingPolicy>::PidOfIdentiferAt(EncodedCharPtr p, EncodedCharPtr last)
  527. {
  528. int32 cch = UnescapeToTempBuf(p, last);
  529. return m_phtbl->PidHashNameLen(m_tempChBuf.m_prgch, cch);
  530. }
  531. template <typename EncodingPolicy>
  532. IdentPtr Scanner<EncodingPolicy>::PidOfIdentiferAt(EncodedCharPtr p, EncodedCharPtr last, bool fHadEscape, bool fHasMultiChar)
  533. {
  534. // If there is an escape sequence in the JS6 identifier or it is a UTF8
  535. // source then we have to convert it to the equivalent char so we use a
  536. // buffer for translation.
  537. if ((EncodingPolicy::MultiUnitEncoding && fHasMultiChar) || fHadEscape)
  538. {
  539. return PidOfIdentiferAt(p, last);
  540. }
  541. else if (EncodingPolicy::MultiUnitEncoding)
  542. {
  543. Assert(sizeof(EncodedChar) == 1);
  544. return m_phtbl->PidHashNameLen(reinterpret_cast<const char *>(p), reinterpret_cast<const char *>(last), (int32)(last - p));
  545. }
  546. else
  547. {
  548. Assert(sizeof(EncodedChar) == 2);
  549. return m_phtbl->PidHashNameLen(reinterpret_cast< const char16 * >(p), (int32)(last - p));
  550. }
  551. }
  552. template <typename EncodingPolicy>
  553. typename Scanner<EncodingPolicy>::EncodedCharPtr Scanner<EncodingPolicy>::FScanNumber(EncodedCharPtr p, double *pdbl, bool& likelyInt)
  554. {
  555. EncodedCharPtr last = m_pchLast;
  556. EncodedCharPtr pchT;
  557. likelyInt = true;
  558. // Reset
  559. m_OctOrLeadingZeroOnLastTKNumber = false;
  560. if ('0' == this->PeekFirst(p, last))
  561. {
  562. switch(this->PeekFirst(p + 1, last))
  563. {
  564. case '.':
  565. case 'e':
  566. case 'E':
  567. likelyInt = false;
  568. // Floating point
  569. goto LFloat;
  570. case 'x':
  571. case 'X':
  572. // Hex
  573. *pdbl = Js::NumberUtilities::DblFromHex(p + 2, &pchT);
  574. if (pchT == p + 2)
  575. {
  576. // "Octal zero token "0" followed by an identifier token beginning with character 'x'/'X'
  577. *pdbl = 0;
  578. return p + 1;
  579. }
  580. else
  581. return pchT;
  582. case 'o':
  583. case 'O':
  584. // Octal
  585. *pdbl = Js::NumberUtilities::DblFromOctal(p + 2, &pchT);
  586. if (pchT == p + 2)
  587. {
  588. // "Octal zero token "0" followed by an identifier token beginning with character 'o'/'O'
  589. *pdbl = 0;
  590. return p + 1;
  591. }
  592. return pchT;
  593. case 'b':
  594. case 'B':
  595. // Binary
  596. *pdbl = Js::NumberUtilities::DblFromBinary(p + 2, &pchT);
  597. if (pchT == p + 2)
  598. {
  599. // "Octal zero token "0" followed by an identifier token beginning with character 'b'/'B'
  600. *pdbl = 0;
  601. return p + 1;
  602. }
  603. return pchT;
  604. default:
  605. // Octal
  606. *pdbl = Js::NumberUtilities::DblFromOctal(p, &pchT);
  607. Assert(pchT > p);
  608. #if !SOURCERELEASE
  609. // If an octal literal is malformed then it is in fact a decimal literal.
  610. #endif // !SOURCERELEASE
  611. if(*pdbl != 0 || pchT > p + 1)
  612. m_OctOrLeadingZeroOnLastTKNumber = true; //report as an octal or hex for JSON when leading 0. Just '0' is ok
  613. switch (*pchT)
  614. {
  615. case '8':
  616. case '9':
  617. // case 'e':
  618. // case 'E':
  619. // case '.':
  620. m_OctOrLeadingZeroOnLastTKNumber = false; //08... or 09....
  621. goto LFloat;
  622. }
  623. return pchT;
  624. }
  625. }
  626. else
  627. {
  628. LFloat:
  629. *pdbl = Js::NumberUtilities::StrToDbl(p, &pchT, likelyInt);
  630. Assert(pchT == p || !Js::NumberUtilities::IsNan(*pdbl));
  631. return pchT;
  632. }
  633. }
  634. template <typename EncodingPolicy>
  635. BOOL Scanner<EncodingPolicy>::oFScanNumber(double *pdbl, bool& likelyInt)
  636. {
  637. EncodedCharPtr pchT;
  638. m_OctOrLeadingZeroOnLastTKNumber = false;
  639. likelyInt = true;
  640. if ('0' == *m_currentCharacter)
  641. {
  642. switch (m_currentCharacter[1])
  643. {
  644. case '.':
  645. case 'e':
  646. case 'E':
  647. likelyInt = false;
  648. // Floating point.
  649. goto LFloat;
  650. case 'x':
  651. case 'X':
  652. // Hex.
  653. *pdbl = Js::NumberUtilities::DblFromHex<EncodedChar>(m_currentCharacter + 2, &pchT);
  654. if (pchT == m_currentCharacter + 2)
  655. {
  656. // "Octal zero token "0" followed by an identifier token beginning with character 'x'/'X'
  657. *pdbl = 0;
  658. m_currentCharacter++;
  659. }
  660. else
  661. m_currentCharacter = pchT;
  662. break;
  663. case 'o':
  664. case 'O':
  665. *pdbl = Js::NumberUtilities::DblFromOctal(m_currentCharacter + 2, &pchT);
  666. if (pchT == m_currentCharacter + 2)
  667. {
  668. // "Octal zero token "0" followed by an identifier token beginning with character 'o'/'O'
  669. *pdbl = 0;
  670. m_currentCharacter++;
  671. }
  672. else
  673. m_currentCharacter = pchT;
  674. break;
  675. case 'b':
  676. case 'B':
  677. *pdbl = Js::NumberUtilities::DblFromBinary(m_currentCharacter + 2, &pchT);
  678. if (pchT == m_currentCharacter + 2)
  679. {
  680. // "Octal zero token "0" followed by an identifier token beginning with character 'b'/'B'
  681. *pdbl = 0;
  682. m_currentCharacter++;
  683. }
  684. else
  685. m_currentCharacter = pchT;
  686. break;
  687. default:
  688. // Octal.
  689. *pdbl = Js::NumberUtilities::DblFromOctal(m_currentCharacter, &pchT);
  690. Assert(pchT > m_currentCharacter);
  691. #if !SOURCERELEASE
  692. // If an octal literal is malformed then it is in fact a decimal literal.
  693. #endif // !SOURCERELEASE
  694. if(*pdbl != 0 || pchT > m_currentCharacter + 1)
  695. m_OctOrLeadingZeroOnLastTKNumber = true; //report as an octal or hex for JSON when leading 0. Just '0' is ok
  696. switch (*pchT)
  697. {
  698. case '8':
  699. case '9':
  700. // case 'e':
  701. // case 'E':
  702. // case '.':
  703. m_OctOrLeadingZeroOnLastTKNumber = false; //08... or 09....
  704. goto LFloat;
  705. }
  706. m_currentCharacter = pchT;
  707. break;
  708. }
  709. }
  710. else
  711. {
  712. LFloat:
  713. // Let StrToDbl do all the work.
  714. *pdbl = Js::NumberUtilities::StrToDbl(m_currentCharacter, &pchT, likelyInt);
  715. if (pchT == m_currentCharacter)
  716. return FALSE;
  717. m_currentCharacter = pchT;
  718. Assert(!Js::NumberUtilities::IsNan(*pdbl));
  719. }
  720. return TRUE;
  721. }
  722. template <typename EncodingPolicy>
  723. tokens Scanner<EncodingPolicy>::TryRescanRegExp()
  724. {
  725. EncodedCharPtr current = m_currentCharacter;
  726. tokens result = RescanRegExp();
  727. if (result == tkScanError)
  728. m_currentCharacter = current;
  729. return result;
  730. }
  731. template <typename EncodingPolicy>
  732. tokens Scanner<EncodingPolicy>::RescanRegExp()
  733. {
  734. #if DEBUG
  735. switch (m_ptoken->tk)
  736. {
  737. case tkDiv:
  738. Assert(m_currentCharacter == m_pchMinTok + 1);
  739. break;
  740. case tkAsgDiv:
  741. Assert(m_currentCharacter == m_pchMinTok + 2);
  742. break;
  743. default:
  744. AssertMsg(FALSE, "Who is calling RescanRegExp?");
  745. break;
  746. }
  747. #endif //DEBUG
  748. m_currentCharacter = m_pchMinTok;
  749. if (*m_currentCharacter != '/')
  750. Error(ERRnoSlash);
  751. m_currentCharacter++;
  752. tokens tk = tkNone;
  753. {
  754. ArenaAllocator alloc(_u("RescanRegExp"), m_parser->GetAllocator()->GetPageAllocator(), m_parser->GetAllocator()->outOfMemoryFunc);
  755. tk = ScanRegExpConstant(&alloc);
  756. }
  757. return tk;
  758. }
  759. template <typename EncodingPolicy>
  760. tokens Scanner<EncodingPolicy>::RescanRegExpNoAST()
  761. {
  762. #if DEBUG
  763. switch (m_ptoken->tk)
  764. {
  765. case tkDiv:
  766. Assert(m_currentCharacter == m_pchMinTok + 1);
  767. break;
  768. case tkAsgDiv:
  769. Assert(m_currentCharacter == m_pchMinTok + 2);
  770. break;
  771. default:
  772. AssertMsg(FALSE, "Who is calling RescanRegExpNoParseTree?");
  773. break;
  774. }
  775. #endif //DEBUG
  776. m_currentCharacter = m_pchMinTok;
  777. if (*m_currentCharacter != '/')
  778. Error(ERRnoSlash);
  779. m_currentCharacter++;
  780. tokens tk = tkNone;
  781. {
  782. ArenaAllocator alloc(_u("RescanRegExp"), m_parser->GetAllocator()->GetPageAllocator(), m_parser->GetAllocator()->outOfMemoryFunc);
  783. {
  784. tk = ScanRegExpConstantNoAST(&alloc);
  785. }
  786. }
  787. return tk;
  788. }
  789. template <typename EncodingPolicy>
  790. tokens Scanner<EncodingPolicy>::RescanRegExpTokenizer()
  791. {
  792. #if DEBUG
  793. switch (m_ptoken->tk)
  794. {
  795. case tkDiv:
  796. Assert(m_currentCharacter == m_pchMinTok + 1);
  797. break;
  798. case tkAsgDiv:
  799. Assert(m_currentCharacter == m_pchMinTok + 2);
  800. break;
  801. default:
  802. AssertMsg(FALSE, "Who is calling RescanRegExpNoParseTree?");
  803. break;
  804. }
  805. #endif //DEBUG
  806. m_currentCharacter = m_pchMinTok;
  807. if (*m_currentCharacter != '/')
  808. Error(ERRnoSlash);
  809. m_currentCharacter++;
  810. tokens tk = tkNone;
  811. ThreadContext *threadContext = ThreadContext::GetContextForCurrentThread();
  812. threadContext->EnsureRecycler();
  813. Js::TempArenaAllocatorObject *alloc = threadContext->GetTemporaryAllocator(_u("RescanRegExp"));
  814. TryFinally(
  815. [&]() /* try block */
  816. {
  817. tk = this->ScanRegExpConstantNoAST(alloc->GetAllocator());
  818. },
  819. [&](bool /* hasException */) /* finally block */
  820. {
  821. threadContext->ReleaseTemporaryAllocator(alloc);
  822. });
  823. return tk;
  824. }
  825. template <typename EncodingPolicy>
  826. tokens Scanner<EncodingPolicy>::ScanRegExpConstant(ArenaAllocator* alloc)
  827. {
  828. if (m_parser && m_parser->IsBackgroundParser())
  829. {
  830. PROBE_STACK_NO_DISPOSE(m_scriptContext, Js::Constants::MinStackRegex);
  831. }
  832. else
  833. {
  834. PROBE_STACK(m_scriptContext, Js::Constants::MinStackRegex);
  835. }
  836. // SEE ALSO: RegexHelper::PrimCompileDynamic()
  837. #ifdef PROFILE_EXEC
  838. m_scriptContext->ProfileBegin(Js::RegexCompilePhase);
  839. #endif
  840. ArenaAllocator* ctAllocator = alloc;
  841. UnifiedRegex::StandardChars<EncodedChar>* standardEncodedChars = m_scriptContext->GetThreadContext()->GetStandardChars((EncodedChar*)0);
  842. UnifiedRegex::StandardChars<char16>* standardChars = m_scriptContext->GetThreadContext()->GetStandardChars((char16*)0);
  843. #if ENABLE_REGEX_CONFIG_OPTIONS
  844. UnifiedRegex::DebugWriter *w = 0;
  845. if (REGEX_CONFIG_FLAG(RegexDebug))
  846. w = m_scriptContext->GetRegexDebugWriter();
  847. if (REGEX_CONFIG_FLAG(RegexProfile))
  848. m_scriptContext->GetRegexStatsDatabase()->BeginProfile();
  849. #endif
  850. UnifiedRegex::Node* root = 0;
  851. charcount_t totalLen = 0, bodyChars = 0, totalChars = 0, bodyLen = 0;
  852. UnifiedRegex::RegexFlags flags = UnifiedRegex::NoRegexFlags;
  853. UnifiedRegex::Parser<EncodingPolicy, true> parser
  854. ( m_scriptContext
  855. , ctAllocator
  856. , standardEncodedChars
  857. , standardChars
  858. , this->IsFromExternalSource()
  859. #if ENABLE_REGEX_CONFIG_OPTIONS
  860. , w
  861. #endif
  862. );
  863. try
  864. {
  865. root = parser.ParseLiteral(m_currentCharacter, m_pchLast, bodyLen, totalLen, bodyChars, totalChars, flags);
  866. }
  867. catch (UnifiedRegex::ParseError e)
  868. {
  869. #ifdef PROFILE_EXEC
  870. m_scriptContext->ProfileEnd(Js::RegexCompilePhase);
  871. #endif
  872. if (m_fSyntaxColor)
  873. return ScanError(m_currentCharacter + e.encodedPos, tkRegExp);
  874. m_currentCharacter += e.encodedPos;
  875. Error(e.error);
  876. }
  877. UnifiedRegex::RegexPattern* pattern;
  878. if (m_parser->IsBackgroundParser())
  879. {
  880. // Avoid allocating pattern from recycler on background thread. The main thread will create the pattern
  881. // and hook it to this parse node.
  882. pattern = parser.template CompileProgram<false>(root, m_currentCharacter, totalLen, bodyChars, bodyLen, totalChars, flags);
  883. }
  884. else
  885. {
  886. pattern = parser.template CompileProgram<true>(root, m_currentCharacter, totalLen, bodyChars, bodyLen, totalChars, flags);
  887. }
  888. this->RestoreMultiUnits(this->m_cMultiUnits + parser.GetMultiUnits()); // m_currentCharacter changed, sync MultiUnits
  889. return m_ptoken->SetRegex(pattern, m_parser);
  890. }
  891. template<typename EncodingPolicy>
  892. tokens Scanner<EncodingPolicy>::ScanRegExpConstantNoAST(ArenaAllocator* alloc)
  893. {
  894. if (m_parser && m_parser->IsBackgroundParser())
  895. {
  896. PROBE_STACK_NO_DISPOSE(m_scriptContext, Js::Constants::MinStackRegex);
  897. }
  898. else
  899. {
  900. PROBE_STACK(m_scriptContext, Js::Constants::MinStackRegex);
  901. }
  902. ThreadContext *threadContext = m_fSyntaxColor ? ThreadContext::GetContextForCurrentThread() : m_scriptContext->GetThreadContext();
  903. UnifiedRegex::StandardChars<EncodedChar>* standardEncodedChars = threadContext->GetStandardChars((EncodedChar*)0);
  904. UnifiedRegex::StandardChars<char16>* standardChars = threadContext->GetStandardChars((char16*)0);
  905. charcount_t totalLen = 0, bodyChars = 0, totalChars = 0, bodyLen = 0;
  906. UnifiedRegex::Parser<EncodingPolicy, true> parser
  907. ( m_scriptContext
  908. , alloc
  909. , standardEncodedChars
  910. , standardChars
  911. , this->IsFromExternalSource()
  912. #if ENABLE_REGEX_CONFIG_OPTIONS
  913. , 0
  914. #endif
  915. );
  916. try
  917. {
  918. parser.ParseLiteralNoAST(m_currentCharacter, m_pchLast, bodyLen, totalLen, bodyChars, totalChars);
  919. }
  920. catch (UnifiedRegex::ParseError e)
  921. {
  922. if (m_fSyntaxColor)
  923. return ScanError(m_currentCharacter + e.encodedPos, tkRegExp);
  924. m_currentCharacter += e.encodedPos;
  925. Error(e.error);
  926. // never reached
  927. }
  928. UnifiedRegex::RegexPattern* pattern = parser.template CompileProgram<false>(nullptr, m_currentCharacter, totalLen, bodyChars, bodyLen, totalChars, UnifiedRegex::NoRegexFlags);
  929. Assert(pattern == nullptr); // BuildAST == false, CompileProgram should return nullptr
  930. this->RestoreMultiUnits(this->m_cMultiUnits + parser.GetMultiUnits()); // m_currentCharacter changed, sync MultiUnits
  931. return (m_ptoken->tk = tkRegExp);
  932. }
  933. template<typename EncodingPolicy>
  934. tokens Scanner<EncodingPolicy>::ScanStringTemplateBegin(EncodedCharPtr *pp)
  935. {
  936. // String template must begin with a string constant followed by '`' or '${'
  937. ScanStringConstant<true, true>('`', pp);
  938. OLECHAR ch;
  939. EncodedCharPtr last = m_pchLast;
  940. ch = this->ReadFirst(*pp, last);
  941. if (ch == '`')
  942. {
  943. // Simple string template - no substitutions
  944. return tkStrTmplBasic;
  945. }
  946. else if (ch == '$')
  947. {
  948. ch = this->ReadFirst(*pp, last);
  949. if (ch == '{')
  950. {
  951. // Next token after expr should be tkStrTmplMid or tkStrTmplEnd.
  952. // In string template scanning mode, we expect the next char to be '}'
  953. // and will treat it as the beginning of tkStrTmplEnd or tkStrTmplMid
  954. m_fStringTemplateDepth++;
  955. // Regular string template begin - next is first substitution
  956. return tkStrTmplBegin;
  957. }
  958. }
  959. // Error - make sure pointer stays at the last character of the error token instead of after it in the error case
  960. (*pp)--;
  961. return ScanError(m_currentCharacter, tkStrTmplBegin);
  962. }
  963. template<typename EncodingPolicy>
  964. tokens Scanner<EncodingPolicy>::ScanStringTemplateMiddleOrEnd(EncodedCharPtr *pp)
  965. {
  966. // String template middle and end tokens must begin with a string constant
  967. ScanStringConstant<true, true>('`', pp);
  968. OLECHAR ch;
  969. EncodedCharPtr last = m_pchLast;
  970. ch = this->ReadFirst(*pp, last);
  971. if (ch == '`')
  972. {
  973. // No longer in string template scanning mode
  974. m_fStringTemplateDepth--;
  975. // This is the last part of the template ...`
  976. return tkStrTmplEnd;
  977. }
  978. else if (ch == '$')
  979. {
  980. ch = this->ReadFirst(*pp, last);
  981. if (ch == '{')
  982. {
  983. // This is just another middle part of the template }...${
  984. return tkStrTmplMid;
  985. }
  986. }
  987. // Error - make sure pointer stays at the last character of the error token instead of after it in the error case
  988. (*pp)--;
  989. return ScanError(m_currentCharacter, tkStrTmplEnd);
  990. }
  991. /*****************************************************************************
  992. *
  993. * Parses a string constant. Note that the string value is stored in
  994. * a volatile buffer (or allocated on the heap if too long), and thus
  995. * the string should be saved off before the next token is scanned.
  *
  * Template parameters:
  *   stringTemplateMode - when true, the text of a template string is scanned:
  *                        unescaped line breaks are accepted, scanning stops
  *                        (without consuming) at '`' or at "${", and octal
  *                        escapes are rejected with ERRES5NoOctal.
  *   createRawString    - when true, the raw (un-cooked) text is additionally
  *                        collected in m_tempChBufSecondary. Must have the same
  *                        value as stringTemplateMode (enforced by static_assert).
  *
  * Parameters:
  *   delim - the quote character that ends the literal.
  *   pp    - in: position at which scanning starts; out: position at which
  *           scanning stopped.
  *
  * Returns tkStrCon when the literal is terminated (or, while colorizing, when an
  * escaped line break is followed by a 0 character). On malformed input the
  * result of ScanError is returned in syntax-coloring mode; otherwise Error()
  * is called with the relevant error code.
  996. */
  997. template<typename EncodingPolicy>
  998. template<bool stringTemplateMode, bool createRawString>
  999. tokens Scanner<EncodingPolicy>::ScanStringConstant(OLECHAR delim, EncodedCharPtr *pp)
  1000. {
  1001. static_assert((stringTemplateMode && createRawString) || (!stringTemplateMode && !createRawString), "stringTemplateMode and createRawString must have the same value");
  // ch    - cooked value of the current character; appended to m_tempChBuf.
  // rawch - raw value of the current character; appended to m_tempChBufSecondary.
  // c, wT - scratch: the most recently read digit character and its numeric value.
  1002. OLECHAR ch, c, rawch;
  1003. int wT;
  1004. EncodedCharPtr p = *pp;
  1005. EncodedCharPtr last = m_pchLast;
  1006. // Reset
  1007. m_OctOrLeadingZeroOnLastTKNumber = false;
  1008. m_EscapeOnLastTkStrCon = FALSE;
  1009. m_tempChBuf.Init();
  1010. // Use template parameter to gate raw string creation.
  1011. // If createRawString is false, all these operations should be no-ops
  1012. if (createRawString)
  1013. {
  1014. m_tempChBufSecondary.Init();
  1015. }
  // Main loop: each iteration handles one source character or one escape
  // sequence. Paths that 'break' out of the switch append ch / rawch at the
  // bottom of the loop; paths that 'continue' append nothing to the cooked buffer.
  1016. for (;;)
  1017. {
  1018. switch ((rawch = ch = this->ReadFirst(p, last)))
  1019. {
  1020. case kchRET:
  1021. if (stringTemplateMode)
  1022. {
  1023. if (this->PeekFirst(p, last) == kchNWL)
  1024. {
  1025. // Eat the <LF> char, ignore return
  1026. this->ReadFirst(p, last);
  1027. }
  1028. // Both <CR> and <CR><LF> are normalized to <LF> in template cooked and raw values
  1029. ch = rawch = kchNWL;
  1030. }
  1031. LEcmaLineBreak:
  1032. // Fall through
  1033. case kchNWL:
  1034. if (stringTemplateMode)
  1035. {
  1036. // Notify the scanner to update current line, number of lines etc
  1037. NotifyScannedNewLine();
  1038. break;
  1039. }
  // Outside template mode an unescaped line break inside the literal is an error.
  1040. m_currentCharacter = p - 1;
  1041. if (m_fSyntaxColor)
  1042. {
  1043. *pp = p - 1;
  1044. return ScanError(p - 1, tkStrCon);
  1045. }
  1046. Error(ERRnoStrEnd);
  // NOTE(review): there is no 'break' here, so control would fall into the quote
  // cases if Error() returned; presumably Error() never returns - confirm.
  1047. case '"':
  1048. case '\'':
  1049. if (ch == delim)
  1050. goto LBreak;
  1051. break;
  1052. case '`':
  1053. // In string template scan mode, don't consume the '`' - we need to differentiate
  1054. // between a closed string template and the expression open sequence - ${
  1055. if (stringTemplateMode)
  1056. {
  1057. p--;
  1058. goto LBreak;
  1059. }
  1060. // If we aren't scanning for a string template, do the default thing
  1061. goto LMainDefault;
  1062. case '$':
  1063. // If we are parsing a string literal part of a string template, ${ indicates we need to switch
  1064. // to parsing an expression.
  1065. if (stringTemplateMode && this->PeekFirst(p, last) == '{')
  1066. {
  1067. // Rewind to the $ and return
  1068. p--;
  1069. goto LBreak;
  1070. }
  1071. // If we aren't scanning for a string template, do the default thing
  1072. goto LMainDefault;
  1073. case kchNUL:
  // A NUL at or past the end of the buffer means the literal was never closed;
  // a NUL before the end is kept as an ordinary character.
  1074. if (p >= last)
  1075. {
  1076. m_currentCharacter = p - 1;
  1077. if (m_fSyntaxColor)
  1078. {
  1079. *pp = p - 1;
  1080. return ScanError(p - 1, tkStrCon);
  1081. }
  1082. Error(ERRnoStrEnd);
  1083. }
  1084. break;
  1085. default:
  1086. LMainDefault:
  1087. if (this->IsMultiUnitChar(ch))
  1088. {
  1089. if ((ch == kchLS || ch == kchPS))
  1090. {
  1091. goto LEcmaLineBreak;
  1092. }
  1093. rawch = ch = this->template ReadRest<true>(ch, p, last);
  1094. switch (ch)
  1095. {
  1096. case kchLS: // 0x2028, classifies as new line
  1097. case kchPS: // 0x2029, classifies as new line
  1098. goto LEcmaLineBreak;
  1099. }
  1100. }
  1101. break;
  // Escape sequences. This case must stay the last one of the outer switch:
  // it declares initialized locals (codePoint, errorType) in the switch scope.
  1102. case kchBSL:
  1103. // In raw mode '\\' is not an escape character, just add the char into the raw buffer.
  1104. m_tempChBufSecondary.template AppendCh<createRawString>(ch);
  1105. m_EscapeOnLastTkStrCon=TRUE;
  1106. // In raw mode, we append the raw char itself and not the escaped value so save the char.
  1107. rawch = ch = this->ReadFirst(p, last);
  1108. codepoint_t codePoint = 0;
  1109. uint errorType = (uint)ERRbadHexDigit;
  1110. switch (ch)
  1111. {
  1112. case 'b':
  1113. ch = 0x08;
  1114. break;
  1115. case 't':
  1116. ch = 0x09;
  1117. break;
  1118. case 'v':
  1119. ch = 0x0B; //Only in ES5 mode
  1120. break; //same as default
  1121. case 'n':
  1122. ch = 0x0A;
  1123. break;
  1124. case 'f':
  1125. ch = 0x0C;
  1126. break;
  1127. case 'r':
  1128. ch = 0x0D;
  1129. break;
  1130. case 'x':
  1131. // Insert the 'x' here before jumping to parse the hex digits.
  1132. m_tempChBufSecondary.template AppendCh<createRawString>(ch);
  1133. // 2 hex digits
  1134. ch = 0;
  1135. goto LTwoHex;
  1136. case 'u':
  1137. // Raw string just inserts a 'u' here.
  1138. m_tempChBufSecondary.template AppendCh<createRawString>(ch);
  1139. ch = 0;
  // A hex digit right after the 'u' selects the fixed four-digit form; otherwise
  // only the braced code point form is acceptable, and only in es6UnicodeMode.
  1140. if (Js::NumberUtilities::FHexDigit(c = this->ReadFirst(p, last), &wT))
  1141. goto LFourHex;
  1142. else if (c != '{' || !this->es6UnicodeMode)
  1143. goto ReturnScanError;
  1144. Assert(c == '{');
  1145. // c should definitely be a '{' which should be appended to the raw string.
  1146. m_tempChBufSecondary.template AppendCh<createRawString>(c);
  1147. //At least one digit is expected
  1148. if (!Js::NumberUtilities::FHexDigit(c = this->ReadFirst(p, last), &wT))
  1149. {
  1150. goto ReturnScanError;
  1151. }
  1152. m_tempChBufSecondary.template AppendCh<createRawString>(c);
  1153. codePoint = static_cast<codepoint_t>(wT);
  // Accumulate the remaining hex digits, rejecting values above 0x10FFFF.
  1154. while(Js::NumberUtilities::FHexDigit(c = this->ReadFirst(p, last), &wT))
  1155. {
  1156. m_tempChBufSecondary.template AppendCh<createRawString>(c);
  1157. codePoint <<= 4;
  1158. codePoint += static_cast<codepoint_t>(wT);
  1159. if (codePoint > 0x10FFFF)
  1160. {
  1161. errorType = (uint)ERRInvalidCodePoint;
  1162. goto ReturnScanError;
  1163. }
  1164. }
  1165. if (c != '}')
  1166. {
  1167. errorType = (uint)ERRMissingCurlyBrace;
  1168. goto ReturnScanError;
  1169. }
  1170. Assert(codePoint <= 0x10FFFF);
  // Code points at or above 0x10000 are stored as a surrogate pair: the first
  // half is appended here, the second half (ch) at the bottom of the loop.
  1171. if (codePoint >= 0x10000)
  1172. {
  1173. OLECHAR lower = 0;
  1174. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &ch);
  1175. m_tempChBuf.AppendCh(lower);
  1176. }
  1177. else
  1178. {
  1179. ch = (char16)codePoint;
  1180. }
  1181. // In raw mode we want the last hex character or the closing curly. c should hold one or the other.
  1182. if (createRawString)
  1183. rawch = c;
  1184. break;
  1185. LFourHex:
  1186. codePoint = 0x0;
  1187. // Append first hex digit character to the raw string.
  1188. m_tempChBufSecondary.template AppendCh<createRawString>(c);
  1189. codePoint += static_cast<codepoint_t>(wT * 0x1000);
  1190. if (!Js::NumberUtilities::FHexDigit(c = this->ReadFirst(p, last), &wT))
  1191. goto ReturnScanError;
  1192. // Append second hex digit character to the raw string.
  1193. m_tempChBufSecondary.template AppendCh<createRawString>(c);
  1194. codePoint += static_cast<codepoint_t>(wT * 0x0100);
  1195. LTwoHex:
  1196. // This code path doesn't expect curly.
  1197. if (!Js::NumberUtilities::FHexDigit(c = this->ReadFirst(p, last), &wT))
  1198. goto ReturnScanError;
  1199. // Append this hex digit (first of a two-digit escape, third of a four-digit escape) to the raw string.
  1200. m_tempChBufSecondary.template AppendCh<createRawString>(c);
  1201. codePoint += static_cast<codepoint_t>(wT * 0x0010);
  1202. if (!Js::NumberUtilities::FHexDigit(c = this->ReadFirst(p, last), &wT))
  1203. goto ReturnScanError;
  1204. codePoint += static_cast<codepoint_t>(wT);
  1205. // In raw mode we want the last hex character or the closing curly. c should hold one or the other.
  1206. if (createRawString)
  1207. rawch = c;
  1208. if (codePoint < 0x10000)
  1209. {
  1210. ch = static_cast<OLECHAR>(codePoint);
  1211. }
  1212. else
  1213. {
  1214. goto ReturnScanError;
  1215. }
  1216. break;
  1217. case '0':
  1218. case '1':
  1219. case '2':
  1220. case '3':
  1221. // 1 to 3 octal digits
  1222. ch -= '0';
  1223. // Octal escape sequences are not allowed inside string template literals
  1224. if (stringTemplateMode)
  1225. {
  // Only a lone escaped zero (not followed by another octal digit) is accepted here.
  1226. c = this->PeekFirst(p, last);
  1227. if (ch != 0 || (c >= '0' && c <= '7'))
  1228. {
  1229. errorType = (uint)ERRES5NoOctal;
  1230. goto ReturnScanError;
  1231. }
  1232. break;
  1233. }
  1234. wT = (c = this->ReadFirst(p, last)) - '0';
  1235. if ((char16)wT > 7)
  1236. {
  // The next character is not an octal digit: un-read it and keep the one-digit value.
  1237. if (ch != 0 || ((char16)wT <= 9))
  1238. {
  1239. m_OctOrLeadingZeroOnLastTKNumber = true;
  1240. }
  1241. p--;
  1242. break;
  1243. }
  1244. m_OctOrLeadingZeroOnLastTKNumber = true;
  1245. ch = static_cast< OLECHAR >(ch * 8 + wT);
  1246. goto LOneOctal;
  1247. case '4':
  1248. case '5':
  1249. case '6':
  1250. case '7':
  1251. // 1 to 2 octal digits
  1252. // Octal escape sequences are not allowed inside string template literals
  1253. if (stringTemplateMode)
  1254. {
  1255. errorType = (uint)ERRES5NoOctal;
  1256. goto ReturnScanError;
  1257. }
  1258. ch -= '0';
  1259. m_OctOrLeadingZeroOnLastTKNumber = true;
  1260. LOneOctal:
  // Optionally consume one more octal digit; a non-octal character is un-read.
  1261. wT = (c = this->ReadFirst(p, last)) - '0';
  1262. if ((char16)wT > 7)
  1263. {
  1264. p--;
  1265. break;
  1266. }
  1267. ch = static_cast< OLECHAR >(ch * 8 + wT);
  1268. break;
  1269. case kchRET: // 0xD
  1270. if (stringTemplateMode)
  1271. {
  1272. // If this is \<CR><LF> we can eat the <LF> right now
  1273. if (this->PeekFirst(p, last) == kchNWL)
  1274. {
  1275. // Eat the <LF> char, ignore return
  1276. this->ReadFirst(p, last);
  1277. }
  1278. // Both \<CR> and \<CR><LF> are normalized to \<LF> in template raw string
  1279. rawch = kchNWL;
  1280. }
  // Falls through to the shared escaped-line-break handling below.
  1281. case kchLS: // 0x2028, classifies as new line
  1282. case kchPS: // 0x2029, classifies as new line
  1283. case kchNWL: // 0xA
  1284. LEcmaEscapeLineBreak:
  1285. if (stringTemplateMode)
  1286. {
  1287. // We're going to ignore the line continuation tokens for the cooked strings, but we need to append the token for raw strings
  1288. m_tempChBufSecondary.template AppendCh<createRawString>(rawch);
  1289. // Template literal strings ignore all escaped line continuation tokens
  1290. NotifyScannedNewLine();
  1291. continue;
  1292. }
  // Non-template line continuation: let ScanNewLine consume the rest of the line
  // break via m_currentCharacter, then resume scanning from there.
  1293. m_currentCharacter = p;
  1294. ScanNewLine(ch);
  1295. p = m_currentCharacter;
  1296. if (m_fSyntaxColor && *p == 0)
  1297. {
  1298. // Special case for multi-line strings during colorization.
  1299. m_scanState = delim == '"' ? ScanStateMultiLineDoubleQuoteString : ScanStateMultiLineSingleQuoteString;
  1300. *pp = p;
  1301. return tkStrCon;
  1302. }
  1303. continue;
  1304. case 0:
  1305. if (p >= last)
  1306. {
  1307. errorType = (uint)ERRnoStrEnd;
  // Shared error exit for every malformed escape above; errorType selects the
  // error that is reported when not colorizing.
  1308. ReturnScanError:
  1309. m_currentCharacter = p - 1;
  1310. if (m_fSyntaxColor)
  1311. {
  1312. *pp = p - 1;
  1313. return ScanError(p - 1, tkStrCon);
  1314. }
  1315. Error(errorType);
  1316. }
  1317. else if (stringTemplateMode)
  1318. {
  1319. // Escaped null character is translated into 0x0030 for raw template literals
  1320. rawch = 0x0030;
  1321. }
  1322. break;
  1323. default:
  1324. if (this->IsMultiUnitChar(ch))
  1325. {
  1326. rawch = ch = this->template ReadRest<true>(ch, p, last);
  1327. switch (ch)
  1328. {
  1329. case kchLS:
  1330. case kchPS:
  1331. goto LEcmaEscapeLineBreak;
  1332. }
  1333. }
  1334. break;
  1335. }
  1336. break;
  1337. }
  // Record the cooked character and (when createRawString) the raw character.
  1338. m_tempChBuf.AppendCh(ch);
  1339. m_tempChBufSecondary.template AppendCh<createRawString>(rawch);
  1340. }
  // Reached when the closing delimiter was read, or, in template mode, when
  // scanning stopped in front of '`' or "${".
  1341. LBreak:
  1342. bool createPid = true;
  1343. if (m_fSyntaxColor || (m_DeferredParseFlags & ScanFlagSuppressStrPid) != 0)
  1344. {
  // Pid creation is suppressed, except for the exact 10-character text "use strict".
  1345. createPid = false;
  1346. if ((m_tempChBuf.m_ichCur == 10) && (0 == memcmp(_u("use strict"), m_tempChBuf.m_prgch, m_tempChBuf.m_ichCur * sizeof(OLECHAR))))
  1347. {
  1348. createPid = true;
  1349. }
  1350. }
  1351. if (createPid)
  1352. {
  1353. m_ptoken->SetIdentifier(m_phtbl->PidHashNameLen(m_tempChBuf.m_prgch, m_tempChBuf.m_ichCur));
  1354. }
  1355. else
  1356. {
  1357. m_ptoken->SetIdentifier(NULL);
  1358. }
  1359. m_scanState = ScanStateNormal;
  1360. m_doubleQuoteOnLastTkStrCon = '"' == delim;
  1361. *pp = p;
  1362. return tkStrCon;
  1363. }
  1364. template<typename EncodingPolicy>
  1365. tokens Scanner<EncodingPolicy>::ScanStringConstant(OLECHAR delim, EncodedCharPtr *pp)
  1366. {
  1367. return ScanStringConstant<false, false>(delim, pp);
  1368. }
  1369. /*****************************************************************************
  1370. *
  1371. * Consume a C-style comment.
  1372. */
  1373. template<typename EncodingPolicy>
  1374. tokens Scanner<EncodingPolicy>::SkipComment(EncodedCharPtr *pp, /* out */ bool* containTypeDef)
  1375. {
  1376. Assert(containTypeDef != nullptr);
  1377. EncodedCharPtr p = *pp;
  1378. *containTypeDef = false;
  1379. EncodedCharPtr last = m_pchLast;
  1380. OLECHAR ch;
  1381. for (;;)
  1382. {
  1383. switch((ch = this->ReadFirst(p, last)))
  1384. {
  1385. case '*':
  1386. if (*p == '/')
  1387. {
  1388. *pp = p + 1;
  1389. if (m_fSyntaxColor)
  1390. {
  1391. m_scanState = ScanStateNormal;
  1392. return tkComment;
  1393. }
  1394. return tkNone;
  1395. }
  1396. break;
  1397. // ES 2015 11.3 Line Terminators
  1398. case kchLS: // 0x2028, classifies as new line
  1399. case kchPS: // 0x2029, classifies as new line
  1400. LEcmaLineBreak:
  1401. goto LLineBreak;
  1402. case kchRET:
  1403. case kchNWL:
  1404. LLineBreak:
  1405. m_fHadEol = TRUE;
  1406. m_currentCharacter = p;
  1407. ScanNewLine(ch);
  1408. p = m_currentCharacter;
  1409. break;
  1410. case kchNUL:
  1411. if (p >= last)
  1412. {
  1413. m_currentCharacter = p - 1;
  1414. *pp = p - 1;
  1415. if (m_fSyntaxColor)
  1416. {
  1417. m_scanState = ScanStateMultiLineComment;
  1418. return tkComment;
  1419. }
  1420. Error(ERRnoCmtEnd);
  1421. }
  1422. break;
  1423. default:
  1424. if (this->IsMultiUnitChar(ch))
  1425. {
  1426. ch = this->template ReadRest<true>(ch, p, last);
  1427. switch (ch)
  1428. {
  1429. case kchLS:
  1430. case kchPS:
  1431. goto LEcmaLineBreak;
  1432. }
  1433. }
  1434. break;
  1435. }
  1436. }
  1437. }
  1438. /*****************************************************************************
  1439. *
  1440. * We've encountered a newline - update various counters and things.
  1441. */
  1442. template<typename EncodingPolicy>
  1443. void Scanner<EncodingPolicy>::ScanNewLine(uint ch)
  1444. {
  1445. if (ch == '\r' && PeekNextChar() == '\n')
  1446. {
  1447. ReadNextChar();
  1448. }
  1449. NotifyScannedNewLine();
  1450. }
  1451. /*****************************************************************************
  1452. *
  1453. * We've encountered a newline - update various counters and things.
  1454. */
  1455. template<typename EncodingPolicy>
  1456. void Scanner<EncodingPolicy>::NotifyScannedNewLine()
  1457. {
  1458. // update in scanner: previous line, current line, number of lines.
  1459. m_line++;
  1460. m_pchPrevLine = m_pchMinLine;
  1461. m_pchMinLine = m_currentCharacter;
  1462. m_cMinLineMultiUnits = this->m_cMultiUnits;
  1463. }
  1464. /*****************************************************************************
  1465. *
  1466. * Delivers a token stream.
  1467. */
  1468. template<typename EncodingPolicy>
  1469. tokens Scanner<EncodingPolicy>::ScanForcingPid()
  1470. {
  1471. if (m_DeferredParseFlags != ScanFlagNone)
  1472. {
  1473. BYTE deferredParseFlagsSave = m_DeferredParseFlags;
  1474. m_DeferredParseFlags = ScanFlagNone;
  1475. tokens result = tkEOF;
  1476. TryFinally(
  1477. [&]() /* try block */
  1478. {
  1479. result = this->Scan();
  1480. },
  1481. [&](bool) /* finally block */
  1482. {
  1483. this->m_DeferredParseFlags = deferredParseFlagsSave;
  1484. });
  1485. return result;
  1486. }
  1487. return Scan();
  1488. }
  1489. template<typename EncodingPolicy>
  1490. tokens Scanner<EncodingPolicy>::Scan()
  1491. {
  1492. return ScanCore(true);
  1493. }
  1494. template<typename EncodingPolicy>
  1495. tokens Scanner<EncodingPolicy>::ScanNoKeywords()
  1496. {
  1497. return ScanCore(false);
  1498. }
  1499. template<typename EncodingPolicy>
  1500. tokens Scanner<EncodingPolicy>::ScanAhead()
  1501. {
  1502. return ScanNoKeywords();
  1503. }
  1504. template<typename EncodingPolicy>
  1505. tokens Scanner<EncodingPolicy>::ScanCore(bool identifyKwds)
  1506. {
  1507. codepoint_t ch;
  1508. OLECHAR firstChar;
  1509. OLECHAR secondChar;
  1510. EncodedCharPtr pchT;
  1511. size_t multiUnits = 0;
  1512. EncodedCharPtr p = m_currentCharacter;
  1513. EncodedCharPtr last = m_pchLast;
  1514. bool seenDelimitedCommentEnd = false;
  1515. // store the last token
  1516. m_tkPrevious = m_ptoken->tk;
  1517. m_iecpLimTokPrevious = IecpLimTok(); // Introduced for use by lambda parsing to find correct span of expression lambdas
  1518. if (p >= last)
  1519. {
  1520. m_pchMinTok = p;
  1521. m_cMinTokMultiUnits = this->m_cMultiUnits;
  1522. goto LEof;
  1523. }
  1524. tokens token;
  1525. m_fHadEol = FALSE;
  1526. CharTypes chType;
  1527. charcount_t commentStartLine;
  1528. if (m_scanState && *p != 0)
  1529. {
  1530. if (m_fSyntaxColor)
  1531. {
  1532. firstChar = 0;
  1533. secondChar = 0;
  1534. m_pchMinTok = p;
  1535. m_cMinTokMultiUnits = this->m_cMultiUnits;
  1536. switch (m_scanState)
  1537. {
  1538. case ScanStateMultiLineComment:
  1539. goto LMultiLineComment;
  1540. case ScanStateMultiLineSingleQuoteString:
  1541. ch = '\'';
  1542. m_scanState = ScanStateNormal;
  1543. goto LScanStringConstant;
  1544. case ScanStateMultiLineDoubleQuoteString:
  1545. ch = '"';
  1546. m_scanState = ScanStateNormal;
  1547. goto LScanStringConstant;
  1548. }
  1549. }
  1550. if (m_scanState == ScanStateStringTemplateMiddleOrEnd)
  1551. {
  1552. AssertMsg(m_fStringTemplateDepth > 0,
  1553. "Shouldn't be trying to parse a string template end or middle token if we aren't scanning a string template");
  1554. m_scanState = ScanStateNormal;
  1555. pchT = p;
  1556. token = ScanStringTemplateMiddleOrEnd(&pchT);
  1557. p = pchT;
  1558. goto LDone;
  1559. }
  1560. }
  1561. for (;;)
  1562. {
  1563. LLoop:
  1564. m_pchMinTok = p;
  1565. m_cMinTokMultiUnits = this->m_cMultiUnits;
  1566. ch = this->ReadFirst(p, last);
  1567. #if DEBUG
  1568. chType = this->charClassifier->GetCharType((OLECHAR)ch);
  1569. #endif
  1570. switch (ch)
  1571. {
  1572. default:
  1573. if (ch == kchLS ||
  1574. ch == kchPS )
  1575. {
  1576. goto LNewLine;
  1577. }
  1578. {
  1579. BOOL isMultiUnit = this->IsMultiUnitChar((OLECHAR)ch);
  1580. if (isMultiUnit)
  1581. {
  1582. ch = this->template ReadRest<true>((OLECHAR)ch, p, last);
  1583. }
  1584. if (es6UnicodeMode && Js::NumberUtilities::IsSurrogateLowerPart(ch))
  1585. {
  1586. codepoint_t upper = this->PeekFull(p, last);
  1587. if (Js::NumberUtilities::IsSurrogateUpperPart(upper))
  1588. {
  1589. // Consume the rest of the utf8 bytes for the codepoint
  1590. OLECHAR decodedUpper = this->ReadSurrogatePairUpper(p, last);
  1591. Assert(decodedUpper == (OLECHAR) upper);
  1592. ch = Js::NumberUtilities::SurrogatePairAsCodePoint(ch, upper);
  1593. }
  1594. }
  1595. if (this->charClassifier->IsIdStart(ch))
  1596. {
  1597. // We treat IDContinue as an error.
  1598. token = ScanIdentifierContinue(identifyKwds, false, !!isMultiUnit, m_pchMinTok, p, &p);
  1599. break;
  1600. }
  1601. }
  1602. chType = this->charClassifier->GetCharType(ch);
  1603. switch (chType)
  1604. {
  1605. case _C_WSP: continue;
  1606. case _C_NWL: goto LNewLine;
  1607. // All other types (except errors) are handled by the outer switch.
  1608. }
  1609. Assert(chType == _C_LET || chType == _C_ERR || chType == _C_UNK || chType == _C_BKQ || chType == _C_SHP || chType == _C_AT || chType == _C_DIG);
  1610. if (m_fSyntaxColor)
  1611. {
  1612. // No need to decrement the current position pointer as scanner will continue with scan next character onwards
  1613. return ScanError(p, tkID);
  1614. }
  1615. m_currentCharacter = p - 1;
  1616. Error(ERRillegalChar);
  1617. continue;
  1618. case '\0':
  1619. // Put back the null in case we get called again.
  1620. p--;
  1621. LEof:
  1622. token = tkEOF;
  1623. if (p + 1 < last)
  1624. {
  1625. if (m_fSyntaxColor)
  1626. {
  1627. return ScanError(p + 1, tkID);
  1628. }
  1629. // A \0 prior to the end of the text is an invalid character.
  1630. Error(ERRillegalChar);
  1631. }
  1632. break;
  1633. case 0x0009:
  1634. case 0x000B:
  1635. case 0x000C:
  1636. case 0x0020:
  1637. Assert(chType == _C_WSP);
  1638. continue;
  1639. case '.':
  1640. if (!Js::NumberUtilities::IsDigit(*p))
  1641. {
  1642. // Not a double
  1643. if (m_scriptContext->GetConfig()->IsES6SpreadEnabled() &&
  1644. this->PeekFirst(p, last) == '.' &&
  1645. this->PeekFirst(p + 1, last) == '.')
  1646. {
  1647. token = tkEllipsis;
  1648. p += 2;
  1649. }
  1650. else
  1651. {
  1652. token = tkDot;
  1653. }
  1654. break;
  1655. }
  1656. // May be a double, fall through
  1657. case '0': case '1': case '2': case '3': case '4':
  1658. case '5': case '6': case '7': case '8': case '9':
  1659. {
  1660. double dbl;
  1661. Assert(chType == _C_DIG || chType == _C_DOT);
  1662. p = m_pchMinTok;
  1663. this->RestoreMultiUnits(m_cMinTokMultiUnits);
  1664. bool likelyInt = true;
  1665. pchT = FScanNumber(p, &dbl, likelyInt);
  1666. if (p == pchT)
  1667. {
  1668. Assert(this->PeekFirst(p, last) != '.');
  1669. if (m_fSyntaxColor)
  1670. {
  1671. return ScanError(m_currentCharacter + 1, tkFltCon);
  1672. }
  1673. Error(ERRbadNumber);
  1674. }
  1675. Assert(!Js::NumberUtilities::IsNan(dbl));
  1676. p = pchT;
  1677. int32 value;
  1678. if (likelyInt && Js::NumberUtilities::FDblIsInt32(dbl, &value))
  1679. {
  1680. m_ptoken->SetLong(value);
  1681. token = tkIntCon;
  1682. }
  1683. else
  1684. {
  1685. token = tkFltCon;
  1686. m_ptoken->SetDouble(dbl, likelyInt);
  1687. }
  1688. break;
  1689. }
  1690. case '(': Assert(chType == _C_LPR); token = tkLParen; break;
  1691. case ')': Assert(chType == _C_RPR); token = tkRParen; break;
  1692. case ',': Assert(chType == _C_CMA); token = tkComma; break;
  1693. case ';': Assert(chType == _C_SMC); token = tkSColon; break;
  1694. case '[': Assert(chType == _C_LBR); token = tkLBrack; break;
  1695. case ']': Assert(chType == _C_RBR); token = tkRBrack; break;
  1696. case '~': Assert(chType == _C_TIL); token = tkTilde; break;
  1697. case '?': Assert(chType == _C_QUE); token = tkQMark; break;
  1698. case '{': Assert(chType == _C_LC); token = tkLCurly; break;
  1699. // ES 2015 11.3 Line Terminators
  1700. case '\r':
  1701. case '\n':
  1702. // kchLS:
  1703. // kchPS:
  1704. LNewLine:
  1705. m_currentCharacter = p;
  1706. ScanNewLine(ch);
  1707. p = m_currentCharacter;
  1708. m_fHadEol = TRUE;
  1709. continue;
  1710. LReserved:
  1711. {
  1712. // We will derive the PID from the token
  1713. Assert(token < tkID);
  1714. m_ptoken->SetIdentifier(NULL);
  1715. goto LDone;
  1716. }
  1717. LEval:
  1718. {
  1719. token = tkID;
  1720. if (!this->m_parser) goto LIdentifier;
  1721. m_ptoken->SetIdentifier(this->m_parser->GetEvalPid());
  1722. goto LDone;
  1723. }
  1724. LArguments:
  1725. {
  1726. token = tkID;
  1727. if (!this->m_parser) goto LIdentifier;
  1728. m_ptoken->SetIdentifier(this->m_parser->GetArgumentsPid());
  1729. goto LDone;
  1730. }
  1731. LTarget:
  1732. {
  1733. token = tkID;
  1734. if (!this->m_parser) goto LIdentifier;
  1735. m_ptoken->SetIdentifier(this->m_parser->GetTargetPid());
  1736. goto LDone;
  1737. }
  1738. #include "kwd-swtch.h"
  1739. case 'A': case 'B': case 'C': case 'D': case 'E':
  1740. case 'F': case 'G': case 'H': case 'I': case 'J':
  1741. case 'K': case 'L': case 'M': case 'N': case 'O':
  1742. case 'P': case 'Q': case 'R': case 'S': case 'T':
  1743. case 'U': case 'V': case 'W': case 'X': case 'Y':
  1744. case 'Z':
  1745. // Lower-case letters handled in kwd-swtch.h above during reserved word recognition.
  1746. case '$': case '_':
  1747. LIdentifier:
  1748. Assert(this->charClassifier->IsIdStart(ch));
  1749. Assert(ch < 0x10000 && !this->IsMultiUnitChar((OLECHAR)ch));
  1750. token = ScanIdentifierContinue(identifyKwds, false, false, m_pchMinTok, p, &p);
  1751. break;
  1752. case '`':
  1753. Assert(chType == _C_BKQ);
  1754. pchT = p;
  1755. token = ScanStringTemplateBegin(&pchT);
  1756. p = pchT;
  1757. break;
  1758. case '}':
  1759. Assert(chType == _C_RC);
  1760. token = tkRCurly;
  1761. break;
  1762. case '\\':
  1763. pchT = p - 1;
  1764. token = ScanIdentifier(identifyKwds, &pchT);
  1765. if (tkScanError == token)
  1766. {
  1767. m_currentCharacter = p;
  1768. if (m_fSyntaxColor)
  1769. return ScanError(p, tkID);
  1770. Error(ERRillegalChar);
  1771. }
  1772. p = pchT;
  1773. break;
  1774. case ':':
  1775. token = tkColon;
  1776. break;
  1777. case '=':
  1778. token = tkAsg;
  1779. switch (this->PeekFirst(p, last))
  1780. {
  1781. case '=':
  1782. p++;
  1783. token = tkEQ;
  1784. if (this->PeekFirst(p, last) == '=')
  1785. {
  1786. p++;
  1787. token = tkEqv;
  1788. }
  1789. break;
  1790. case '>':
  1791. p++;
  1792. token = tkDArrow;
  1793. break;
  1794. }
  1795. break;
  1796. case '!':
  1797. token = tkBang;
  1798. if (this->PeekFirst(p, last) == '=')
  1799. {
  1800. p++;
  1801. token = tkNE;
  1802. if (this->PeekFirst(p, last) == '=')
  1803. {
  1804. p++;
  1805. token = tkNEqv;
  1806. }
  1807. }
  1808. break;
  1809. case '+':
  1810. token = tkAdd;
  1811. switch (this->PeekFirst(p, last))
  1812. {
  1813. case '=':
  1814. p++;
  1815. token = tkAsgAdd;
  1816. break;
  1817. case '+':
  1818. p++;
  1819. token = tkInc;
  1820. break;
  1821. }
  1822. break;
  1823. case '-':
  1824. token = tkSub;
  1825. switch (this->PeekFirst(p, last))
  1826. {
  1827. case '=':
  1828. p++;
  1829. token = tkAsgSub;
  1830. break;
  1831. case '-':
  1832. p++;
  1833. token = tkDec;
  1834. if (!m_fIsModuleCode)
  1835. {
  1836. if ('>' == this->PeekFirst(p, last) && (m_fHadEol || seenDelimitedCommentEnd)) // --> HTMLCloseComment
  1837. {
  1838. goto LSkipLineComment;
  1839. }
  1840. }
  1841. break;
  1842. }
  1843. break;
  1844. case '*':
  1845. token = tkStar;
  1846. switch(this->PeekFirst(p, last))
  1847. {
  1848. case '=' :
  1849. p++;
  1850. token = tkAsgMul;
  1851. break;
  1852. case '*' :
  1853. if (!m_scriptContext->GetConfig()->IsES7ExponentiationOperatorEnabled())
  1854. {
  1855. break;
  1856. }
  1857. p++;
  1858. token = tkExpo;
  1859. if (this->PeekFirst(p, last) == '=')
  1860. {
  1861. p++;
  1862. token = tkAsgExpo;
  1863. }
  1864. }
  1865. break;
  1866. case '/':
  1867. token = tkDiv;
  1868. switch(this->PeekFirst(p, last))
  1869. {
  1870. case '=':
  1871. p++;
  1872. token = tkAsgDiv;
  1873. break;
  1874. case '/':
  1875. if (p >= last)
  1876. {
  1877. AssertMsg(!m_fIsModuleCode, "Do we have other line comment cases scanning pass last?");
  1878. // Effective source length may have excluded HTMLCommentSuffix "//... -->". If we are scanning
  1879. // those, we have passed "last" already. Move back and return EOF.
  1880. p = last;
  1881. goto LEof;
  1882. }
  1883. ch = *++p;
  1884. firstChar = (OLECHAR)ch;
  1885. LSkipLineComment:
  1886. pchT = NULL;
  1887. for (;;)
  1888. {
  1889. switch ((ch = this->ReadFirst(p, last)))
  1890. {
  1891. case kchLS: // 0x2028, classifies as new line
  1892. case kchPS: // 0x2029, classifies as new line
  1893. LEcmaCommentLineBreak:
  1894. // kchPS and kchLS are more than one unit in UTF-8.
  1895. if (pchT)
  1896. {
  1897. // kchPS and kchLS are more than one unit in UTF-8.
  1898. p = pchT;
  1899. }
  1900. else
  1901. {
  1902. // But only a single code unit in UTF16
  1903. p--;
  1904. }
  1905. this->RestoreMultiUnits(multiUnits);
  1906. goto LCommentLineBreak;
  1907. case kchNWL:
  1908. case kchRET:
  1909. p--;
  1910. LCommentLineBreak:
  1911. if (m_fSyntaxColor)
  1912. {
  1913. token = tkComment;
  1914. goto LDone;
  1915. }
  1916. // Subtract the comment length from the total char count for the purpose
  1917. // of deciding whether to defer AST and byte code generation.
  1918. m_parser->ReduceDeferredScriptLength((ULONG)(p - m_pchMinTok));
  1919. break;
  1920. case kchNUL:
  1921. if (p >= last)
  1922. {
  1923. p--;
  1924. goto LCommentLineBreak;
  1925. }
  1926. continue;
  1927. default:
  1928. if (this->IsMultiUnitChar((OLECHAR)ch))
  1929. {
  1930. pchT = p - 1;
  1931. multiUnits = this->m_cMultiUnits;
  1932. switch (ch = this->template ReadRest<true>((OLECHAR)ch, p, last))
  1933. {
  1934. case kchLS:
  1935. case kchPS:
  1936. goto LEcmaCommentLineBreak;
  1937. }
  1938. }
  1939. continue;
  1940. }
  1941. break;
  1942. }
  1943. continue;
  1944. case '*':
  1945. ch = *++p;
  1946. firstChar = (OLECHAR)ch;
  1947. if ((p + 1) < last)
  1948. {
  1949. secondChar = (OLECHAR)(*(p + 1));
  1950. }
  1951. else
  1952. {
  1953. secondChar = '\0';
  1954. }
  1955. LMultiLineComment:
  1956. pchT = p;
  1957. commentStartLine = m_line;
  1958. bool containTypeDef;
  1959. if (tkNone == (token = SkipComment(&pchT, &containTypeDef)))
  1960. {
  1961. // Subtract the comment length from the total char count for the purpose
  1962. // of deciding whether to defer AST and byte code generation.
  1963. m_parser->ReduceDeferredScriptLength((ULONG)(pchT - m_pchMinTok));
  1964. p = pchT;
  1965. seenDelimitedCommentEnd = true;
  1966. goto LLoop;
  1967. }
  1968. p = pchT;
  1969. break;
  1970. }
  1971. break;
  1972. case '%':
  1973. Assert(chType == _C_PCT);
  1974. token = tkPct;
  1975. if (this->PeekFirst(p, last) == '=')
  1976. {
  1977. p++;
  1978. token = tkAsgMod;
  1979. }
  1980. break;
  1981. case '<':
  1982. Assert(chType == _C_LT);
  1983. token = tkLT;
  1984. switch (this->PeekFirst(p, last))
  1985. {
  1986. case '=':
  1987. p++;
  1988. token = tkLE;
  1989. break;
  1990. case '<':
  1991. p++;
  1992. token = tkLsh;
  1993. if (this->PeekFirst(p, last) == '=')
  1994. {
  1995. p++;
  1996. token = tkAsgLsh;
  1997. break;
  1998. }
  1999. break;
  2000. case '!':
  2001. // ES 2015 B.1.3 - HTML comments are only allowed when parsing non-module code.
  2002. if (!m_fIsModuleCode && this->PeekFirst(p + 1, last) == '-' && this->PeekFirst(p + 2, last) == '-')
  2003. {
  2004. // This is a "<!--" comment - treat as //
  2005. if (p >= last)
  2006. {
  2007. // Effective source length may have excluded HTMLCommentSuffix "<!-- ... -->". If we are scanning
  2008. // those, we have passed "last" already. Move back and return EOF.
  2009. p = last;
  2010. goto LEof;
  2011. }
  2012. firstChar = '!';
  2013. goto LSkipLineComment;
  2014. }
  2015. break;
  2016. }
  2017. break;
  2018. case '>':
  2019. Assert(chType == _C_GT);
  2020. token = tkGT;
  2021. switch (this->PeekFirst(p, last))
  2022. {
  2023. case '=':
  2024. p++;
  2025. token = tkGE;
  2026. break;
  2027. case '>':
  2028. p++;
  2029. token = tkRsh;
  2030. switch (this->PeekFirst(p, last))
  2031. {
  2032. case '=':
  2033. p++;
  2034. token = tkAsgRsh;
  2035. break;
  2036. case '>':
  2037. p++;
  2038. token = tkRs2;
  2039. if (*p == '=')
  2040. {
  2041. p++;
  2042. token = tkAsgRs2;
  2043. }
  2044. break;
  2045. }
  2046. break;
  2047. }
  2048. break;
  2049. case '^':
  2050. Assert(chType == _C_XOR);
  2051. token = tkXor;
  2052. if (this->PeekFirst(p, last) == '=')
  2053. {
  2054. p++;
  2055. token = tkAsgXor;
  2056. }
  2057. break;
  2058. case '|':
  2059. Assert(chType == _C_BAR);
  2060. token = tkOr;
  2061. switch (this->PeekFirst(p, last))
  2062. {
  2063. case '=':
  2064. p++;
  2065. token = tkAsgOr;
  2066. break;
  2067. case '|':
  2068. p++;
  2069. token = tkLogOr;
  2070. break;
  2071. }
  2072. break;
  2073. case '&':
  2074. Assert(chType == _C_AMP);
  2075. token = tkAnd;
  2076. switch (this->PeekFirst(p, last))
  2077. {
  2078. case '=':
  2079. p++;
  2080. token = tkAsgAnd;
  2081. break;
  2082. case '&':
  2083. p++;
  2084. token = tkLogAnd;
  2085. break;
  2086. }
  2087. break;
  2088. case '\'':
  2089. case '"':
  2090. Assert(chType == _C_QUO || chType == _C_APO);
  2091. LScanStringConstant:
  2092. pchT = p;
  2093. token = this->ScanStringConstant((OLECHAR)ch, &pchT);
  2094. p = pchT;
  2095. break;
  2096. }
  2097. break;
  2098. }
  2099. LDone:
  2100. m_currentCharacter = p;
  2101. return (m_ptoken->tk = token);
  2102. }
  2103. template <typename EncodingPolicy>
  2104. IdentPtr Scanner<EncodingPolicy>::GetSecondaryBufferAsPid()
  2105. {
  2106. bool createPid = true;
  2107. if (m_fSyntaxColor || (m_DeferredParseFlags & ScanFlagSuppressStrPid) != 0)
  2108. {
  2109. createPid = false;
  2110. }
  2111. if (createPid)
  2112. {
  2113. return m_phtbl->PidHashNameLen(m_tempChBufSecondary.m_prgch, m_tempChBufSecondary.m_ichCur);
  2114. }
  2115. else
  2116. {
  2117. return nullptr;
  2118. }
  2119. }
  2120. template <typename EncodingPolicy>
  2121. LPCOLESTR Scanner<EncodingPolicy>::StringFromLong(int32 lw)
  2122. {
  2123. _ltow_s(lw, m_tempChBuf.m_prgch, m_tempChBuf.m_cchMax, 10);
  2124. return m_tempChBuf.m_prgch;
  2125. }
  2126. template <typename EncodingPolicy>
  2127. IdentPtr Scanner<EncodingPolicy>::PidFromLong(int32 lw)
  2128. {
  2129. return m_phtbl->PidHashName(StringFromLong(lw));
  2130. }
  2131. template <typename EncodingPolicy>
  2132. LPCOLESTR Scanner<EncodingPolicy>::StringFromDbl(double dbl)
  2133. {
  2134. if (!Js::NumberUtilities::FDblToStr(dbl, m_tempChBuf.m_prgch, m_tempChBuf.m_cchMax))
  2135. {
  2136. Error(ERRnoMemory);
  2137. }
  2138. return m_tempChBuf.m_prgch;
  2139. }
  2140. template <typename EncodingPolicy>
  2141. IdentPtr Scanner<EncodingPolicy>::PidFromDbl(double dbl)
  2142. {
  2143. return m_phtbl->PidHashName(StringFromDbl(dbl));
  2144. }
  2145. template <typename EncodingPolicy>
  2146. void Scanner<EncodingPolicy>::Capture(_Out_ RestorePoint* restorePoint)
  2147. {
  2148. Capture(restorePoint, 0, 0);
  2149. }
  2150. template <typename EncodingPolicy>
  2151. void Scanner<EncodingPolicy>::Capture(_Out_ RestorePoint* restorePoint, uint functionIdIncrement, size_t lengthDecr)
  2152. {
  2153. restorePoint->m_ichMinTok = this->IchMinTok();
  2154. restorePoint->m_ichMinLine = this->IchMinLine();
  2155. restorePoint->m_cMinTokMultiUnits = this->m_cMinTokMultiUnits;
  2156. restorePoint->m_cMinLineMultiUnits = this->m_cMinLineMultiUnits;
  2157. restorePoint->m_line = this->m_line;
  2158. restorePoint->m_fHadEol = this->m_fHadEol;
  2159. restorePoint->functionIdIncrement = functionIdIncrement;
  2160. restorePoint->lengthDecr = lengthDecr;
  2161. #ifdef DEBUG
  2162. restorePoint->m_cMultiUnits = this->m_cMultiUnits;
  2163. #endif
  2164. }
  2165. template <typename EncodingPolicy>
  2166. void Scanner<EncodingPolicy>::SeekTo(const RestorePoint& restorePoint)
  2167. {
  2168. SeekAndScan<false>(restorePoint);
  2169. }
  2170. template <typename EncodingPolicy>
  2171. void Scanner<EncodingPolicy>::SeekToForcingPid(const RestorePoint& restorePoint)
  2172. {
  2173. SeekAndScan<true>(restorePoint);
  2174. }
  2175. template <typename EncodingPolicy>
  2176. template <bool forcePid>
  2177. void Scanner<EncodingPolicy>::SeekAndScan(const RestorePoint& restorePoint)
  2178. {
  2179. this->m_currentCharacter = this->m_pchBase + restorePoint.m_ichMinTok + restorePoint.m_cMinTokMultiUnits;
  2180. this->m_pchMinLine = this->m_pchBase + restorePoint.m_ichMinLine + restorePoint.m_cMinLineMultiUnits;
  2181. this->m_cMinLineMultiUnits = restorePoint.m_cMinLineMultiUnits;
  2182. this->RestoreMultiUnits(restorePoint.m_cMinTokMultiUnits);
  2183. if (forcePid)
  2184. {
  2185. this->ScanForcingPid();
  2186. }
  2187. else
  2188. {
  2189. this->Scan();
  2190. }
  2191. this->m_line = restorePoint.m_line;
  2192. this->m_fHadEol = restorePoint.m_fHadEol;
  2193. this->m_parser->ReduceDeferredScriptLength(restorePoint.lengthDecr);
  2194. Assert(this->m_cMultiUnits == restorePoint.m_cMultiUnits);
  2195. }
  2196. template <typename EncodingPolicy>
  2197. void Scanner<EncodingPolicy>::SeekTo(const RestorePoint& restorePoint, uint *nextFunctionId)
  2198. {
  2199. SeekTo(restorePoint);
  2200. *nextFunctionId += restorePoint.functionIdIncrement;
  2201. }
  2202. // Called by CompileScriptException::ProcessError to retrieve a BSTR for the line on which an error occurred.
  2203. template<typename EncodingPolicy>
  2204. HRESULT Scanner<EncodingPolicy>::SysAllocErrorLine(int32 ichMinLine, __out BSTR* pbstrLine)
  2205. {
  2206. if( !pbstrLine )
  2207. {
  2208. return E_POINTER;
  2209. }
  2210. // If we overflow the string, we have a serious problem...
  2211. if (ichMinLine < 0 || static_cast<size_t>(ichMinLine) > AdjustedLength() )
  2212. {
  2213. return E_UNEXPECTED;
  2214. }
  2215. typename EncodingPolicy::EncodedCharPtr pStart = static_cast<size_t>(ichMinLine) == IchMinLine() ? m_pchMinLine : m_pchBase + this->CharacterOffsetToUnitOffset(m_pchBase, m_currentCharacter, m_pchLast, ichMinLine);
  2216. // Determine the length by scanning for the next newline
  2217. charcount_t cch = LineLength(pStart, m_pchLast);
  2218. Assert(cch <= LONG_MAX);
  2219. typename EncodingPolicy::EncodedCharPtr pEnd = static_cast<size_t>(ichMinLine) == IchMinLine() ? m_pchMinLine + cch : m_pchBase + this->CharacterOffsetToUnitOffset(m_pchBase, m_currentCharacter, m_pchLast, cch);
  2220. *pbstrLine = SysAllocStringLen(NULL, cch);
  2221. if (!*pbstrLine)
  2222. {
  2223. return E_OUTOFMEMORY;
  2224. }
  2225. this->ConvertToUnicode(*pbstrLine, cch, pStart, pEnd);
  2226. return S_OK;
  2227. }
  2228. template class Scanner<NullTerminatedUnicodeEncodingPolicy>;
  2229. template class Scanner<NullTerminatedUTF8EncodingPolicy>;
  2230. template class Scanner<NotNullTerminatedUTF8EncodingPolicy>;