RegexParser.cpp 123 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. /*
  6. --------------------------------------------------------------------------------------------------------------------------------
  7. Original
  8. --------------------------------------------------------------------------------------------------------------------------------
  9. Pattern ::= Disjunction
  10. Disjunction ::= Alternative | Alternative '|' Disjunction
  11. Alternative ::= [empty] | Alternative Term
  12. Term ::= Assertion | Atom | Atom Quantifier
  13. Assertion ::= '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  14. Quantifier ::= QuantifierPrefix | QuantifierPrefix '?'
  15. QuantifierPrefix ::= '*' | '+' | '?' | '{' DecimalDigits '}' | '{' DecimalDigits ',' '}' | '{' DecimalDigits ',' DecimalDigits '}'
  16. Atom ::= PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')'
  17. PatternCharacter ::= SourceCharacter but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
  18. AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
  19. CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
  20. ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
  21. ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
  22. IdentityEscape ::= (SourceCharacter but not IdentifierPart) | <ZWJ> | <ZWNJ>
  23. DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
  24. CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
  25. CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
  26. ClassRanges ::= [empty] | NonemptyClassRanges
  27. NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
  28. NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
  29. ClassAtom ::= '-' | ClassAtomNoDash
  30. ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
  31. ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
  32. SourceCharacter ::= <unicode character>
  33. HexEscapeSequence ::= 'x' HexDigit HexDigit
  34. UnicodeEscapeSequence ::= 'u' HexDigit HexDigit HexDigit HexDigit | 'u' '{' HexDigits '}'
  35. HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
  36. IdentifierStart ::= UnicodeLetter | '$' | '_' | '\' UnicodeEscapeSequence
  37. IdentifierPart ::= IdentifierStart | UnicodeCombiningMark | UnicodeDigit | UnicodeConnectorPunctuation | <ZWNJ> | <ZWJ>
  38. UnicodeLetter ::= <unicode Uppercase letter> | <unicode Lowercase letter> | <unicode Titlecase letter> | <unicode Modifier letter> | <unicode Other letter> | <unicode Letter number>
  39. UnicodeCombiningMark = <unicode Non-spacing mark> | <unicode combining spacing mark>
  40. UnicodeDigit ::= <unicode Decimal number>
  41. UnicodeConnectorPunctuation ::= <unicode Connector punctuation>
  42. DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
  43. DecimalDigits ::= DecimalDigit | DecimalDigits DecimalDigit
  44. DecimalDigit ::= one of { '0'..'9' }
  45. NonZeroDigit ::= one of { '1'..'9' }
  46. ------------------
  47. Annex B Deviations
  48. ------------------
  49. 1. The assertions (?= ) and (?! ) are treated as though they have a surrounding non-capture group, and hence can be quantified.
  50. Other assertions are not quantifiable.
  51. QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  52. Assertion [replaced] ::= '^' | '$' | '\' 'b' | '\' 'B' | QuantifiableAssertion
  53. Term ::= ... | QuantifiableAssertion Quantifier [added]
  54. --------------------------------------------------------------------------------------------------------------------------------
  55. Left factored
  56. --------------------------------------------------------------------------------------------------------------------------------
  57. Pattern ::= Disjunction
  58. Disjunction ::= Alternative ('|' Alternative)*
  59. FOLLOW(Disjunction) = { <eof>, ')' }
  60. Alternative ::= Term*
  61. FOLLOW(Alternative) = { <eof>, ')', '|' }
  62. Term ::= ( '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')' | Atom Quantifier? )
  63. | ( PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')' )
  64. ( '*' | '+' | '?' | '{' DecimalDigits (',' DecimalDigits?)? '}' ) '?'?
  65. PatternCharacter ::= <unicode character> but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
  66. AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
  67. CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
  68. ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
  69. ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
  70. IdentityEscape ::= <unicode character> but not <unicode Uppercase letter>, <unicode Lowercase letter>, <unicode Titlecase letter>, <unicode Modifier letter>, <unicode Other letter>, <unicode Letter number>, '$', '_', <unicode Non-spacing mark>, <unicode combining spacing mark>, <unicode Decimal number>, <unicode Connector punctuation>
  71. DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
  72. CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
  73. CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
  74. ClassRanges ::= [empty] | NonemptyClassRanges
  75. NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
  76. NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
  77. ClassAtom ::= '-' | ClassAtomNoDash
  78. ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
  79. ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
  80. HexEscapeSequence ::= 'x' HexDigit{2}
  81. UnicodeEscapeSequence ::= 'u' HexDigit{4} | 'u' '{' HexDigits{4-6} '}'
  82. HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
  83. DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
  84. DecimalDigits ::= DecimalDigit+
  85. DecimalDigit ::= one of { '0'..'9' }
  86. NonZeroDigit ::= one of { '1'..'9' }
  87. ------------------
  88. Annex B Deviations
  89. ------------------
  90. QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  91. Term ::= ... | '(' '?' '=' Disjunction ')' [removed] | '(' '?' '!' Disjunction ')' [removed] | QuantifiableAssertion Quantifier? [added]
  92. */
  93. #include "ParserPch.h"
  94. namespace UnifiedRegex
  95. {
  96. ParseError::ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error)
  97. : isBody(isBody), pos(pos), encodedPos(encodedPos), error(error)
  98. {
  99. }
  100. template <typename P, const bool IsLiteral>
  101. Parser<P, IsLiteral>::Parser
  102. ( Js::ScriptContext* scriptContext
  103. , ArenaAllocator* ctAllocator
  104. , StandardChars<EncodedChar>* standardEncodedChars
  105. , StandardChars<Char>* standardChars
  106. , bool isFromExternalSource
  107. #if ENABLE_REGEX_CONFIG_OPTIONS
  108. , DebugWriter* w
  109. #endif
  110. )
  111. : scriptContext(scriptContext)
  112. , ctAllocator(ctAllocator)
  113. , standardEncodedChars(standardEncodedChars)
  114. , standardChars(standardChars)
  115. #if ENABLE_REGEX_CONFIG_OPTIONS
  116. , w(w)
  117. #endif
  118. , input(0)
  119. , inputLim(0)
  120. , next(0)
  121. , inBody(false)
  122. , numGroups(1)
  123. , nextGroupId(1) // implicit overall group always takes index 0
  124. , litbuf(0)
  125. , litbufLen(0)
  126. , litbufNext(0)
  127. , surrogatePairList(nullptr)
  128. , currentSurrogatePairNode(nullptr)
  129. , tempLocationOfSurrogatePair(nullptr)
  130. , tempLocationOfRange(nullptr)
  131. , codePointAtTempLocation(0)
  132. , unicodeFlagPresent(false)
  133. , caseInsensitiveFlagPresent(false)
  134. , positionAfterLastSurrogate(nullptr)
  135. , valueOfLastSurrogate(INVALID_CODEPOINT)
  136. , deferredIfNotUnicodeError(nullptr)
  137. , deferredIfUnicodeError(nullptr)
  138. {
  139. if (isFromExternalSource)
  140. FromExternalSource();
  141. }
  142. //
  143. // Input buffer management
  144. //
  145. template <typename P, const bool IsLiteral>
  146. void Parser<P, IsLiteral>::SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody)
  147. {
  148. this->input = input;
  149. this->inputLim = inputLim;
  150. next = input;
  151. this->inBody = inBody;
  152. RestoreMultiUnits(0);
  153. }
  154. template <typename P, const bool IsLiteral>
  155. inline CharCount Parser<P, IsLiteral>::Pos()
  156. {
  157. CharCount nextOffset = Chars<EncodedChar>::OSB(next, input);
  158. Assert(nextOffset >= m_cMultiUnits);
  159. return nextOffset - (CharCount)m_cMultiUnits;
  160. }
  161. template <typename P, const bool IsLiteral>
  162. inline bool Parser<P, IsLiteral>::IsEOF()
  163. {
  164. return next >= inputLim;
  165. }
  166. template <typename P, const bool IsLiteral>
  167. inline bool Parser<P, IsLiteral>::ECCanConsume(CharCount n = 1)
  168. {
  169. return next + n <= inputLim;
  170. }
  171. template <typename P, const bool IsLiteral>
  172. inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookahead(CharCount n = 0)
  173. {
  174. // Ok to look ahead to terminating 0
  175. Assert(next + n <= inputLim);
  176. return next[n];
  177. }
  178. template <typename P, const bool IsLiteral>
  179. inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookback(CharCount n = 0)
  180. {
  181. // Ok to look ahead to terminating 0
  182. Assert(n + input <= next);
  183. return *(next - n);
  184. }
  185. template <typename P, const bool IsLiteral>
  186. inline void Parser<P, IsLiteral>::ECConsume(CharCount n = 1)
  187. {
  188. Assert(next + n <= inputLim);
  189. #if DBG
  190. for (CharCount i = 0; i < n; i++)
  191. Assert(!IsMultiUnitChar(next[i]));
  192. #endif
  193. next += n;
  194. }
  195. template <typename P, const bool IsLiteral>
  196. inline void Parser<P, IsLiteral>::ECConsumeMultiUnit(CharCount n = 1)
  197. {
  198. Assert(next + n <= inputLim);
  199. next += n;
  200. }
  201. template <typename P, const bool IsLiteral>
  202. inline void Parser<P, IsLiteral>::ECRevert(CharCount n = 1)
  203. {
  204. Assert(n + input <= next);
  205. next -= n;
  206. }
  207. //
  208. // Helpers
  209. //
  210. template <typename P, const bool IsLiteral>
  211. int Parser<P, IsLiteral>::TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false)
  212. {
  213. if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  214. {
  215. return 0;
  216. }
  217. if (!ECCanConsume(2) || ECLookahead(0) !='{' || !standardEncodedChars->IsHex(ECLookahead(1)))
  218. {
  219. return 0;
  220. }
  221. // The first character is mandatory to consume escape sequence, so we check for it above, at this stage we can set it as we already checked.
  222. codepoint_t codePoint = standardEncodedChars->DigitValue(ECLookahead(1));
  223. int i = 2;
  224. while(ECCanConsume(i + 1) && standardEncodedChars->IsHex(ECLookahead(i)))
  225. {
  226. codePoint <<= 4;
  227. codePoint += standardEncodedChars->DigitValue(ECLookahead(i));
  228. if (codePoint > 0x10FFFF)
  229. {
  230. return 0;
  231. }
  232. i++;
  233. }
  234. if(!ECCanConsume(i + 1) || ECLookahead(i) != '}')
  235. {
  236. return 0;
  237. }
  238. uint consumptionNumber = i + 1;
  239. Assert(consumptionNumber >= 3);
  240. if (!previousSurrogatePart && trackSurrogatePair && this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  241. {
  242. // Current location
  243. TrackIfSurrogatePair(codePoint, (next - 1), consumptionNumber + 1);
  244. }
  245. char16 other;
  246. // Generally if this code point is a single character, then we take it and return.
  247. // If the character is made up of two characters then we emit the first and backtrack to the start of th escape sequence;
  248. // Following that we check if we have already seen the first character, and if so emit the second and consume the entire escape sequence.
  249. if (codePoint < 0x10000)
  250. {
  251. c = UTC(codePoint);
  252. ECConsumeMultiUnit(consumptionNumber);
  253. }
  254. else if (previousSurrogatePart)
  255. {
  256. previousSurrogatePart = false;
  257. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &other, &c);
  258. ECConsumeMultiUnit(consumptionNumber);
  259. }
  260. else
  261. {
  262. previousSurrogatePart = true;
  263. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &c, &other);
  264. Assert(ECLookback(1) == 'u' && ECLookback(2) == '\\');
  265. ECRevert(2);
  266. }
  267. return consumptionNumber;
  268. }
  269. // This function has the following 'knowledge':
  270. // - A codepoint that might be part of a surrogate pair or part of one
  271. // - A location where that codepoint is located
  272. // - Previously tracked part of surrogate pair (tempLocationOfSurrogatePair, and codePointAtTempLocation), which is always a surrogate lower part.
  273. // - A pointer to the current location of the linked list
  274. // - If a previous location is tracked, then it is of a parsed character (not given character) before current, and not same to current
  275. // This can't be asserted directly, and has to be followed by callers. Term pass can reset with each iteration, as well as this method in cases it needs.
  276. template <typename P, const bool IsLiteral>
  277. void Parser<P, IsLiteral>:: TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength)
  278. {
  279. Assert(codePoint < 0x110000);
  280. Assert(location != nullptr);
  281. Assert(location != this->tempLocationOfSurrogatePair);
  282. if (Js::NumberUtilities::IsSurrogateLowerPart(codePoint))
  283. {
  284. this->tempLocationOfSurrogatePair = location;
  285. this->codePointAtTempLocation = codePoint;
  286. }
  287. else
  288. {
  289. if(Js::NumberUtilities::IsSurrogateUpperPart(codePoint) && this->tempLocationOfSurrogatePair != nullptr)
  290. {
  291. Assert(Js::NumberUtilities::IsSurrogateLowerPart(codePointAtTempLocation));
  292. consumptionLength = (uint32)(location - this->tempLocationOfSurrogatePair) + consumptionLength;
  293. codePoint = Js::NumberUtilities::SurrogatePairAsCodePoint(codePointAtTempLocation, codePoint);
  294. location = this->tempLocationOfSurrogatePair;
  295. }
  296. // At this point we can clear previous location, and then if codePoint is bigger than 0xFFFF store it, as we either received it or combined it above
  297. this->tempLocationOfSurrogatePair = nullptr;
  298. this->codePointAtTempLocation = 0;
  299. }
  300. if (codePoint > 0xFFFF)
  301. {
  302. this->positionAfterLastSurrogate = location + consumptionLength;
  303. this->valueOfLastSurrogate = codePoint;
  304. // When parsing without AST we aren't given an allocator. In addition, only the 2 lines above are used during Pass 0;
  305. // while the bottom is used during Pass 1 (which isn't done when ParseNoAST)
  306. if(this->ctAllocator != nullptr)
  307. {
  308. SurrogatePairTracker* node = Anew(this->ctAllocator, SurrogatePairTracker, location, this->tempLocationOfRange, codePoint, consumptionLength, m_cMultiUnits);
  309. if (surrogatePairList == nullptr)
  310. {
  311. Assert(currentSurrogatePairNode == nullptr);
  312. surrogatePairList = node;
  313. currentSurrogatePairNode = node;
  314. }
  315. else
  316. {
  317. Assert(currentSurrogatePairNode != nullptr);
  318. currentSurrogatePairNode->next = node;
  319. currentSurrogatePairNode = node;
  320. }
  321. }
  322. }
  323. }
  324. template <typename P, const bool IsLiteral>
  325. Node* Parser<P, IsLiteral>::CreateSurrogatePairAtom(char16 lower, char16 upper)
  326. {
  327. MatchLiteralNode * literalNode = Anew(this->ctAllocator, MatchLiteralNode, 0, 0);
  328. MatchCharNode lowerNode(lower);
  329. MatchCharNode upperNode(upper);
  330. AccumLiteral(literalNode, &lowerNode);
  331. AccumLiteral(literalNode, &upperNode);
  332. return literalNode;
  333. }
  334. // This function will create appropriate pairs of ranges and add them to the disjunction node.
  335. // Terms used in comments:
  336. // - A minor codePoint is smaller than major codePoint, and both define a range of codePoints above 0x10000; to avoid confusion between lower/upper denoting codeUnits composing the surrogate pair.
  337. // - A boundary is a mod 0x400 alignment marker due to the nature of surrogate pairs representation. So the codepoint 0x10300 lies between boundaries 0x10000 and 0x10400.
  338. // - A prefix is the range set used to represent the values from minorCodePoint to the first boundary above minorCodePoint if applicable.
  339. // - A suffix is the range set used to represent the values from first boundary below majorCodePoint to the majorCodePoint if applicable.
  340. // - A full range is the range set used to represent the values from first boundary above minorCodePoint to first boundary below majorCodePoint if applicable.
  341. // The algorithm works as follows:
  342. // 1. Determine minorBoundary (minorCodePoint - mod 0x400 +0x400) and majorBoundary (majorCodePoint - mod 0x400). minorBoundary > minorCodePoint and majorBoundary < majorCodePoint
  343. // 2. Based on the codePoints and the boundaries, prefix, suffix, and full range is determined. Here are the rules:
  344. // 2-a. If minorBoundary > majorBoundary, we have an inner boundary range, output just that.
  345. // 2-b. If minorBoundary - 0x400u != minorCodepoint (i.e. codePoint doesn't lie right on a boundary to be part of a full range) we have a prefix.
  346. // 2-c. If majorBoundary + 0x3FFu != majorCodepoint (i.e. codePoint doesn't lie right before a boundary to be part of a full range) we have a suffix.
  347. // 2-d. We have a full range, if the two boundaries don't equal, OR the codePoints lie on the range boundaries opposite to what constitutes a prefix/suffix.
  348. // Visual representation for sample range 0x10300 - 0x10900
  349. // | [ _ | ^ ] |
  350. // 0x10000 0x10800 0x11000
  351. // [ ] - denote the actual range
  352. // _ - minorBoundary
  353. // ^ - majorBoundary
  354. // | - other boundaries
  355. // prefix is between [ and _
  356. // suffix is between ^ and ]
  357. // full range is between _ and ^
  358. template <typename P, const bool IsLiteral>
  359. AltNode* Parser<P, IsLiteral>::AppendSurrogateRangeToDisjunction(codepoint_t minorCodePoint, codepoint_t majorCodePoint, AltNode *lastAltNode)
  360. {
  361. Assert(minorCodePoint < majorCodePoint);
  362. Assert(minorCodePoint >= 0x10000u);
  363. Assert(majorCodePoint >= 0x10000u);
  364. Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && unicodeFlagPresent);
  365. char16 lowerMinorCodeUnit, upperMinorCodeUnit, lowerMajorCodeUnit, upperMajorCodeUnit;
  366. Js::NumberUtilities::CodePointAsSurrogatePair(minorCodePoint, &lowerMinorCodeUnit, &upperMinorCodeUnit);
  367. Js::NumberUtilities::CodePointAsSurrogatePair(majorCodePoint, &lowerMajorCodeUnit, &upperMajorCodeUnit);
  368. // These boundaries represent whole range boundaries, as in 0x10000, 0x10400, 0x10800 etc
  369. // minor boundary is the first boundary strictly above minorCodePoint
  370. // major boundary is the first boundary below or equal to majorCodePoint
  371. codepoint_t minorBoundary = minorCodePoint - (minorCodePoint % 0x400u) + 0x400u;
  372. codepoint_t majorBoundary = majorCodePoint - (majorCodePoint % 0x400u);
  373. Assert(minorBoundary >= 0x10000);
  374. Assert(majorBoundary >= 0x10000);
  375. AltNode* tailToAdd = nullptr;
  376. // If the minor boundary is higher than major boundary, that means we have a range within the boundary and is less than 0x400
  377. // Ex: 0x10430 - 0x10700 will have minor boundary of 0x10800 and major of 0x10400
  378. // This pair will be represented in single range set.
  379. const bool singleRange = minorBoundary > majorBoundary;
  380. if (singleRange)
  381. {
  382. Assert(majorCodePoint - minorCodePoint < 0x400u);
  383. Assert(lowerMinorCodeUnit == lowerMajorCodeUnit);
  384. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, lowerMinorCodeUnit);
  385. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  386. setNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)upperMajorCodeUnit);
  387. ConcatNode* concatNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, setNode, nullptr));
  388. tailToAdd = Anew(ctAllocator, AltNode, concatNode, nullptr);
  389. }
  390. else
  391. {
  392. Node* prefixNode = nullptr, *suffixNode = nullptr;
  393. const bool twoConsecutiveRanges = minorBoundary == majorBoundary;
  394. // For minorBoundary,
  395. if (minorBoundary - minorCodePoint == 1) // Single character in minor range
  396. {
  397. // The prefix is only a surrogate pair atom
  398. prefixNode = CreateSurrogatePairAtom(lowerMinorCodeUnit, upperMinorCodeUnit);
  399. }
  400. else if (minorCodePoint != minorBoundary - 0x400u) // Minor range isn't full
  401. {
  402. Assert(minorBoundary - minorCodePoint < 0x400u);
  403. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorCodeUnit);
  404. MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false);
  405. upperSetNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)0xDFFFu);
  406. prefixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
  407. }
  408. else // Full minor range
  409. {
  410. minorBoundary -= 0x400u;
  411. }
  412. if (majorBoundary == majorCodePoint) // Single character in major range
  413. {
  414. // The suffix is only a surrogate pair atom
  415. suffixNode = CreateSurrogatePairAtom(lowerMajorCodeUnit, upperMajorCodeUnit);
  416. majorBoundary -= 0x400u;
  417. }
  418. else if (majorBoundary + 0x3FFu != majorCodePoint) // Major range isn't full
  419. {
  420. Assert(majorCodePoint - majorBoundary < 0x3FFu);
  421. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMajorCodeUnit);
  422. MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  423. upperSetNode->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)upperMajorCodeUnit);
  424. suffixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
  425. majorBoundary -= 0x400u;
  426. }
  427. const bool nonFullConsecutiveRanges = twoConsecutiveRanges && prefixNode != nullptr && suffixNode != nullptr;
  428. if (nonFullConsecutiveRanges)
  429. {
  430. Assert(suffixNode != nullptr);
  431. Assert(minorCodePoint != minorBoundary - 0x400u);
  432. Assert(majorBoundary + 0x3FFu != majorCodePoint);
  433. // If the minor boundary is equal to major boundary, that means we have a cross boundary range that only needs 2 nodes for prefix/suffix.
  434. // We can only cross one boundary.
  435. Assert(majorCodePoint - minorCodePoint < 0x800u);
  436. tailToAdd = Anew(ctAllocator, AltNode, prefixNode, Anew(ctAllocator, AltNode, suffixNode, nullptr));
  437. }
  438. else
  439. {
  440. // We have 3 sets of ranges, comprising of prefix, full and suffix.
  441. Assert(majorCodePoint - minorCodePoint >= 0x400u);
  442. Assert((prefixNode != nullptr && suffixNode != nullptr) // Spanning more than two ranges
  443. || (prefixNode == nullptr && minorBoundary == minorCodePoint) // Two consecutive ranges and the minor is full
  444. || (suffixNode == nullptr && majorBoundary + 0x3FFu == majorCodePoint)); // Two consecutive ranges and the major is full
  445. Node* lowerOfFullRange;
  446. char16 lowerMinorBoundary, lowerMajorBoundary, ignore;
  447. Js::NumberUtilities::CodePointAsSurrogatePair(minorBoundary, &lowerMinorBoundary, &ignore);
  448. bool singleFullRange = majorBoundary == minorBoundary;
  449. if (singleFullRange)
  450. {
  451. // The lower part of the full range is simple a surrogate lower char
  452. lowerOfFullRange = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorBoundary);
  453. }
  454. else
  455. {
  456. Js::NumberUtilities::CodePointAsSurrogatePair(majorBoundary, &lowerMajorBoundary, &ignore);
  457. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  458. setNode->set.SetRange(ctAllocator, (Char)lowerMinorBoundary, (Char)lowerMajorBoundary);
  459. lowerOfFullRange = setNode;
  460. }
  461. MatchSetNode* fullUpperRange = Anew(ctAllocator, MatchSetNode, false, false);
  462. fullUpperRange->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)0xDFFFu);
  463. // These are added in the following order [full] [prefix][suffix]
  464. // This is doing by prepending, so in reverse.
  465. if (suffixNode != nullptr)
  466. {
  467. tailToAdd = Anew(ctAllocator, AltNode, suffixNode, tailToAdd);
  468. }
  469. if (prefixNode != nullptr)
  470. {
  471. tailToAdd = Anew(ctAllocator, AltNode, prefixNode, tailToAdd);
  472. }
  473. tailToAdd = Anew(ctAllocator, AltNode, Anew(ctAllocator, ConcatNode, lowerOfFullRange, Anew(ctAllocator, ConcatNode, fullUpperRange, nullptr)), tailToAdd);
  474. }
  475. }
  476. if (lastAltNode != nullptr)
  477. {
  478. Assert(lastAltNode->tail == nullptr);
  479. lastAltNode->tail = tailToAdd;
  480. }
  481. return tailToAdd;
  482. }
  483. template <typename P, const bool IsLiteral>
  484. AltNode* Parser<P, IsLiteral>::AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAltNode)
  485. {
  486. char16 lower, upper;
  487. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &upper);
  488. AltNode* tailNode = Anew(ctAllocator, AltNode, CreateSurrogatePairAtom(lower, upper), nullptr);
  489. if (lastAltNode != nullptr)
  490. {
  491. lastAltNode->tail = tailNode;
  492. }
  493. return tailNode;
  494. }
  495. //
  496. // Errors
  497. //
  498. template <typename P, const bool IsLiteral>
  499. void Parser<P, IsLiteral>::Fail(HRESULT error)
  500. {
  501. throw ParseError(inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  502. }
  503. // This doesn't throw, but stores first error code for throwing later
  504. template <typename P, const bool IsLiteral>
  505. void Parser<P, IsLiteral>::DeferredFailIfUnicode(HRESULT error)
  506. {
  507. Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  508. if (this->deferredIfUnicodeError == nullptr)
  509. {
  510. this->deferredIfUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  511. }
  512. }
  513. // This doesn't throw, but stores first error code for throwing later
  514. template <typename P, const bool IsLiteral>
  515. void Parser<P, IsLiteral>::DeferredFailIfNotUnicode(HRESULT error)
  516. {
  517. Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  518. if (this->deferredIfNotUnicodeError == nullptr)
  519. {
  520. this->deferredIfNotUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  521. }
  522. }
  523. template <typename P, const bool IsLiteral>
  524. inline void Parser<P, IsLiteral>::ECMust(EncodedChar ec, HRESULT error)
  525. {
  526. // We never look for 0
  527. Assert(ec != 0);
  528. if (ECLookahead() != ec)
  529. Fail(error);
  530. ECConsume();
  531. }
  532. template <typename P, const bool IsLiteral>
  533. inline char16 Parser<P, IsLiteral>::NextChar()
  534. {
  535. Assert(!IsEOF());
  536. // Could be an embedded 0
  537. Char c = ReadFull<true>(next, inputLim);
  538. // No embedded newlines in literals
  539. if (IsLiteral && standardChars->IsNewline(c))
  540. Fail(ERRnoSlash);
  541. return c;
  542. }
  543. //
  544. // Patterns/Disjunctions/Alternatives
  545. //
  546. template <typename P, const bool IsLiteral>
  547. void Parser<P, IsLiteral>::PatternPass0()
  548. {
  549. this->positionAfterLastSurrogate = nullptr;
  550. this->deferredIfNotUnicodeError = nullptr;
  551. this->deferredIfUnicodeError = nullptr;
  552. DisjunctionPass0(0);
  553. }
  554. template <typename P, const bool IsLiteral>
  555. Node* Parser<P, IsLiteral>::PatternPass1()
  556. {
  557. return DisjunctionPass1();
  558. }
  559. template <typename P, const bool IsLiteral>
  560. Node* Parser<P, IsLiteral>::UnionNodes(Node* prev, Node* curr)
  561. {
  562. if (prev->tag == Node::MatchChar)
  563. {
  564. MatchCharNode* prevChar = (MatchCharNode*)prev;
  565. if (curr->tag == Node::MatchChar)
  566. {
  567. MatchCharNode* currChar = (MatchCharNode*)curr;
  568. if (prevChar->cs[0] == currChar->cs[0])
  569. // Just ignore current node
  570. return prevChar;
  571. else
  572. {
  573. // Union chars into new set
  574. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
  575. setNode->set.Set(ctAllocator, prevChar->cs[0]);
  576. setNode->set.Set(ctAllocator, currChar->cs[0]);
  577. return setNode;
  578. }
  579. }
  580. else if (curr->tag == Node::MatchSet)
  581. {
  582. MatchSetNode* currSet = (MatchSetNode*)curr;
  583. if (currSet->isNegation)
  584. // Can't merge
  585. return 0;
  586. else
  587. {
  588. // Union chars into new set
  589. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
  590. setNode->set.Set(ctAllocator, prevChar->cs[0]) ;
  591. setNode->set.UnionInPlace(ctAllocator, currSet->set);
  592. return setNode;
  593. }
  594. }
  595. else
  596. // Can't merge
  597. return 0;
  598. }
  599. else if (prev->tag == Node::MatchSet)
  600. {
  601. MatchSetNode* prevSet = (MatchSetNode*)prev;
  602. if (prevSet->isNegation)
  603. // Can't merge
  604. return 0;
  605. else if (curr->tag == Node::MatchChar)
  606. {
  607. MatchCharNode* currChar = (MatchCharNode*)curr;
  608. // Include char in prev set
  609. prevSet->set.Set(ctAllocator, currChar->cs[0]);
  610. return prevSet;
  611. }
  612. else if (curr->tag == Node::MatchSet)
  613. {
  614. MatchSetNode* currSet = (MatchSetNode*)curr;
  615. if (currSet->isNegation)
  616. // Can't merge
  617. return 0;
  618. else
  619. {
  620. // Include chars in prev set
  621. prevSet->set.UnionInPlace(ctAllocator, currSet->set);
  622. return prevSet;
  623. }
  624. }
  625. else
  626. // Can't merge
  627. return 0;
  628. }
  629. else
  630. // Can't merge
  631. return 0;
  632. }
  633. template <typename P, const bool IsLiteral>
  634. void Parser<P, IsLiteral>::DisjunctionPass0(int depth)
  635. {
  636. AlternativePass0(depth);
  637. while (true)
  638. {
  639. // Could be terminating 0
  640. if (ECLookahead() != '|')
  641. return;
  642. ECConsume();
  643. AlternativePass0(depth);
  644. }
  645. }
  646. template <typename P, const bool IsLiteral>
  647. Node* Parser<P, IsLiteral>::DisjunctionPass1()
  648. {
  649. // Maintain the invariants:
  650. // - alt lists have two or more items
  651. // - alt list items are never alt lists (so we must inline them)
  652. // - an alt list never contains two consecutive match-character/match-set nodes
  653. // (so we must union consecutive items into a single set)
  654. Node* node = AlternativePass1();
  655. AltNode* last = 0;
  656. // First node may be an alternative
  657. if (node->tag == Node::Alt)
  658. {
  659. last = (AltNode*)node;
  660. while (last->tail != 0)
  661. last = last->tail;
  662. }
  663. while (true)
  664. {
  665. // Could be terminating 0
  666. if (ECLookahead() != '|')
  667. return node;
  668. ECConsume(); // '|'
  669. Node* next = AlternativePass1();
  670. AnalysisAssert(next != nullptr);
  671. Node* revisedPrev = UnionNodes(last == 0 ? node : last->head, next);
  672. if (revisedPrev != 0)
  673. {
  674. // Can merge next into previously seen alternative
  675. if (last == 0)
  676. node = revisedPrev;
  677. else
  678. last->head = revisedPrev;
  679. }
  680. else if (next->tag == Node::Alt)
  681. {
  682. AltNode* nextList = (AltNode*)next;
  683. // Append inner list to current list
  684. revisedPrev = UnionNodes(last == 0 ? node : last->head, nextList->head);
  685. if (revisedPrev != 0)
  686. {
  687. // Can merge head of list into previously seen alternative
  688. if (last ==0)
  689. node = revisedPrev;
  690. else
  691. last->head = revisedPrev;
  692. nextList = nextList->tail;
  693. }
  694. AnalysisAssert(nextList != nullptr);
  695. if (last == 0)
  696. node = Anew(ctAllocator, AltNode, node, nextList);
  697. else
  698. last->tail = nextList;
  699. while (nextList->tail != 0)
  700. nextList = nextList->tail;
  701. last = nextList;
  702. }
  703. else
  704. {
  705. // Append node
  706. AltNode* cons = Anew(ctAllocator, AltNode, next, 0);
  707. if (last == 0)
  708. node = Anew(ctAllocator, AltNode, node, cons);
  709. else
  710. last->tail = cons;
  711. last = cons;
  712. }
  713. }
  714. }
  715. template <typename P, const bool IsLiteral>
  716. inline bool Parser<P, IsLiteral>::IsEndOfAlternative()
  717. {
  718. EncodedChar ec = ECLookahead();
  719. // Could be terminating 0, but embedded 0 is part of alternative
  720. return (ec == 0 && IsEOF()) || ec == ')' || ec == '|' || (IsLiteral && ec == '/');
  721. }
  722. template <typename P, const bool IsLiteral>
  723. void Parser<P, IsLiteral>::EnsureLitbuf(CharCount size)
  724. {
  725. if (litbufLen - litbufNext < size)
  726. {
  727. CharCount newLen = max(litbufLen, initLitbufSize);
  728. while (newLen < litbufNext + size)
  729. newLen *= 2;
  730. litbuf = (Char*)ctAllocator->Realloc(litbuf, litbufLen * sizeof(Char), newLen * sizeof(Char));
  731. litbufLen = newLen;
  732. }
  733. }
  734. template <typename P, const bool IsLiteral>
  735. void Parser<P, IsLiteral>::AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode)
  736. {
  737. Assert(charOrLiteralNode->tag == Node::MatchChar || charOrLiteralNode->tag == Node::MatchLiteral);
  738. CharCount addLen = charOrLiteralNode->LiteralLength();
  739. Assert(addLen > 0);
  740. if (deferredLiteralNode->length == 0)
  741. {
  742. // Start a new literal
  743. EnsureLitbuf(addLen);
  744. deferredLiteralNode->offset = litbufNext;
  745. deferredLiteralNode->length = addLen;
  746. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  747. }
  748. else if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
  749. {
  750. // Keep growing the current literal
  751. EnsureLitbuf(addLen);
  752. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  753. deferredLiteralNode->length += addLen;
  754. }
  755. else if (charOrLiteralNode->tag == Node::MatchLiteral && deferredLiteralNode->offset + deferredLiteralNode->length == ((MatchLiteralNode*)charOrLiteralNode)->offset)
  756. {
  757. // Absorb next literal into current literal since they are adjacent
  758. deferredLiteralNode->length += addLen;
  759. }
  760. else
  761. {
  762. // Abandon current literal and start a fresh one (leaves gap)
  763. EnsureLitbuf(deferredLiteralNode->length + addLen);
  764. js_wmemcpy_s(litbuf + litbufNext, litbufLen - litbufNext, litbuf + deferredLiteralNode->offset, deferredLiteralNode->length);
  765. deferredLiteralNode->offset = litbufNext;
  766. litbufNext += deferredLiteralNode->length;
  767. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  768. deferredLiteralNode->length += addLen;
  769. }
  770. }
  771. template <typename P, const bool IsLiteral>
  772. Node* Parser<P, IsLiteral>::FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode)
  773. {
  774. if (node == deferredLiteralNode)
  775. {
  776. #if DBG
  777. if (deferredLiteralNode->length == 0)
  778. Assert(false);
  779. #endif
  780. Assert(deferredLiteralNode->offset < litbufNext);
  781. Assert(deferredLiteralNode->offset + deferredLiteralNode->length <= litbufNext);
  782. if (deferredLiteralNode->length == 1)
  783. {
  784. node = Anew(ctAllocator, MatchCharNode, litbuf[deferredLiteralNode->offset]);
  785. if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
  786. // Reclaim last added character
  787. litbufNext--;
  788. // else: leave a gap in the literal buffer
  789. }
  790. else
  791. node = Anew(ctAllocator, MatchLiteralNode, *deferredLiteralNode);
  792. deferredLiteralNode->offset = 0;
  793. deferredLiteralNode->length = 0;
  794. }
  795. return node;
  796. }
  797. template <typename P, const bool IsLiteral>
  798. void Parser<P, IsLiteral>::AlternativePass0(int depth)
  799. {
  800. while (!IsEndOfAlternative())
  801. TermPass0(depth);
  802. }
  803. template <typename P, const bool IsLiteral>
  804. Node* Parser<P, IsLiteral>::AlternativePass1()
  805. {
  806. if (IsEndOfAlternative())
  807. return Anew(ctAllocator, SimpleNode, Node::Empty);
  808. MatchCharNode deferredCharNode(0);
  809. MatchLiteralNode deferredLiteralNode(0, 0);
  810. // Maintain the invariants:
  811. // - concat lists have two or more items
  812. // - concat list items are never concat lists
  813. // - a concat list never contains two consecutive match-character/match-literal nodes
  814. bool previousSurrogatePart = false;
  815. Node* node = TermPass1(&deferredCharNode, previousSurrogatePart);
  816. AnalysisAssert(node != nullptr);
  817. ConcatNode* last = 0;
  818. // First node may be a concat
  819. if (node->tag == Node::Concat)
  820. {
  821. last = (ConcatNode*)node;
  822. while (last->tail != 0)
  823. last = last->tail;
  824. }
  825. if (last == 0)
  826. {
  827. if (node->LiteralLength() > 0)
  828. {
  829. // Begin a new literal
  830. AccumLiteral(&deferredLiteralNode, node);
  831. node = &deferredLiteralNode;
  832. }
  833. }
  834. else
  835. {
  836. if (last->head->LiteralLength() > 0)
  837. {
  838. // Begin a new literal
  839. AccumLiteral(&deferredLiteralNode, last->head);
  840. last->head = &deferredLiteralNode;
  841. }
  842. }
  843. while (!IsEndOfAlternative())
  844. {
  845. Node* next = TermPass1(&deferredCharNode, previousSurrogatePart);
  846. AnalysisAssert(next != nullptr);
  847. if (next->LiteralLength() > 0)
  848. {
  849. // Begin a new literal or grow the existing literal
  850. AccumLiteral(&deferredLiteralNode, next);
  851. if (last == 0)
  852. {
  853. if (node != &deferredLiteralNode)
  854. {
  855. // So far we have first item and the current literal
  856. ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
  857. node = Anew(ctAllocator, ConcatNode, node, cons);
  858. last = cons;
  859. }
  860. // else: keep growing first literal
  861. }
  862. else
  863. {
  864. if (last->head != &deferredLiteralNode)
  865. {
  866. // Append a new literal node
  867. ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
  868. last->tail = cons;
  869. last = cons;
  870. }
  871. // else: keep growing current literal
  872. }
  873. }
  874. else if (next->tag == Node::Concat)
  875. {
  876. // Append this list to accumulated list
  877. ConcatNode* nextList = (ConcatNode*)next;
  878. if (nextList->head->LiteralLength() > 0 &&
  879. ((last == 0 && node == &deferredLiteralNode) ||
  880. (last != 0 && last->head == &deferredLiteralNode)))
  881. {
  882. // Absorb the next character or literal into the current literal
  883. // (may leave a gab in litbuf)
  884. AccumLiteral(&deferredLiteralNode, nextList->head);
  885. nextList = nextList->tail;
  886. // List has at least two items
  887. AnalysisAssert(nextList != 0);
  888. // List should be in canonical form, so no consecutive chars/literals
  889. Assert(nextList->head->LiteralLength() == 0);
  890. }
  891. if (last == 0)
  892. node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), nextList);
  893. else
  894. {
  895. last->head = FinalTerm(last->head, &deferredLiteralNode);
  896. last->tail = nextList;
  897. }
  898. while (nextList->tail != 0)
  899. nextList = nextList->tail;
  900. last = nextList;
  901. // No outstanding literals
  902. Assert(deferredLiteralNode.length == 0);
  903. if (last->head->LiteralLength() > 0)
  904. {
  905. // If the list ends with a literal, transfer it into deferredLiteralNode
  906. // so we can continue accumulating (won't leave a gab in litbuf)
  907. AccumLiteral(&deferredLiteralNode, last->head);
  908. // Can discard MatchLiteralNode since it lives in compile-time allocator
  909. last->head = &deferredLiteralNode;
  910. }
  911. }
  912. else
  913. {
  914. // Append this node to accumulated list
  915. ConcatNode* cons = Anew(ctAllocator, ConcatNode, next, 0);
  916. if (last == 0)
  917. node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), cons);
  918. else
  919. {
  920. last->head = FinalTerm(last->head, &deferredLiteralNode);
  921. last->tail = cons;
  922. }
  923. last = cons;
  924. // No outstanding literals
  925. Assert(deferredLiteralNode.length == 0);
  926. }
  927. }
  928. if (last == 0)
  929. node = FinalTerm(node, &deferredLiteralNode);
  930. else
  931. last->head = FinalTerm(last->head, &deferredLiteralNode);
  932. // No outstanding literals
  933. Assert(deferredLiteralNode.length == 0);
  934. return node;
  935. }
  936. //
  937. // Terms
  938. //
  939. template <typename P, const bool IsLiteral>
  940. Node* Parser<P, IsLiteral>::NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body)
  941. {
  942. //
  943. // NOTE: We'd like to represent r? (i.e. r{0,1}) as r|<empty> since the loop representation has high overhead.
  944. // HOWEVER if r contains a group definition and could match empty then we must execute as a loop
  945. // so that group bindings are correctly reset on no progress (e.g.: /(a*)?/.exec("")). Thus we defer
  946. // this optimization until pass 4 of the optimizer, at which point we know whether r could match empty.
  947. //
  948. if (lower == 1 && upper == 1)
  949. return body;
  950. else if (lower == 0 && upper == 0)
  951. // Loop is equivalent to empty. If the loop body contains group definitions they will have already been
  952. // counted towards the overall number of groups. The matcher will initialize their contents to
  953. // undefined, and since the loop body would never execute the inner groups could never be updated from
  954. // undefined.
  955. return Anew(ctAllocator, SimpleNode, Node::Empty);
  956. else
  957. return Anew(ctAllocator, LoopNode, lower, upper, isGreedy, body);
  958. }
  959. template <typename P, const bool IsLiteral>
  960. bool Parser<P, IsLiteral>::AtQuantifier()
  961. {
  962. // Could be terminating 0
  963. switch (ECLookahead())
  964. {
  965. case '*':
  966. case '+':
  967. case '?':
  968. return true;
  969. case '{':
  970. {
  971. CharCount lookahead = 1;
  972. while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
  973. lookahead++;
  974. if (lookahead == 1 || !ECCanConsume(lookahead + 1))
  975. return false;
  976. switch (ECLookahead(lookahead))
  977. {
  978. case ',':
  979. lookahead++;
  980. if (ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}')
  981. return true;
  982. else
  983. {
  984. CharCount saved = lookahead;
  985. while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
  986. lookahead++;
  987. if (lookahead == saved)
  988. return false;
  989. return ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}';
  990. }
  991. case '}':
  992. return true;
  993. default:
  994. return false;
  995. }
  996. }
  997. default:
  998. return false;
  999. }
  1000. }
  1001. template <typename P, const bool IsLiteral>
  1002. bool Parser<P, IsLiteral>::OptNonGreedy()
  1003. {
  1004. // Could be terminating 0
  1005. if (ECLookahead() != '?')
  1006. return true;
  1007. ECConsume();
  1008. return false;
  1009. }
  1010. template <typename P, const bool IsLiteral>
  1011. CharCount Parser<P, IsLiteral>::RepeatCount()
  1012. {
  1013. CharCount n = 0;
  1014. int digits = 0;
  1015. while (true)
  1016. {
  1017. // Could be terminating 0
  1018. EncodedChar ec = ECLookahead();
  1019. if (!standardEncodedChars->IsDigit(ec))
  1020. {
  1021. if (digits == 0)
  1022. Fail(JSERR_RegExpSyntax);
  1023. return n;
  1024. }
  1025. if (n > MaxCharCount / 10)
  1026. Fail(JSERR_RegExpSyntax);
  1027. n *= 10;
  1028. if (n > MaxCharCount - standardEncodedChars->DigitValue(ec))
  1029. Fail(JSERR_RegExpSyntax);
  1030. n += standardEncodedChars->DigitValue(ec);
  1031. digits++;
  1032. ECConsume();
  1033. }
  1034. }
  1035. template <typename P, const bool IsLiteral>
  1036. void Parser<P, IsLiteral>::TermPass0(int depth)
  1037. {
  1038. PROBE_STACK(scriptContext, Js::Constants::MinStackRegex);
  1039. // Either we have a location at the start, or the end, never both. As in between it should have been cleared if surrogate pair
  1040. // Or must be cleared if we didn't perform the check
  1041. bool clearLocationIfPresent = this->tempLocationOfSurrogatePair != nullptr;
  1042. switch (ECLookahead())
  1043. {
  1044. case '^':
  1045. case '$':
  1046. ECConsume();
  1047. return;
  1048. case '\\':
  1049. ECConsume();
  1050. if (AtomEscapePass0())
  1051. return;
  1052. break;
  1053. case '(':
  1054. // Can't combine into a single codeunit because of group present
  1055. this->tempLocationOfSurrogatePair = nullptr;
  1056. this->codePointAtTempLocation = 0;
  1057. clearLocationIfPresent = false;
  1058. ECConsume();
  1059. switch (ECLookahead())
  1060. {
  1061. case '?':
  1062. if (!ECCanConsume(2))
  1063. Fail(JSERR_RegExpSyntax);
  1064. switch (ECLookahead(1))
  1065. {
  1066. case '=':
  1067. case '!':
  1068. case ':':
  1069. ECConsume(2);
  1070. break;
  1071. default:
  1072. numGroups++;
  1073. break;
  1074. }
  1075. break;
  1076. default:
  1077. numGroups++;
  1078. break;
  1079. }
  1080. DisjunctionPass0(depth + 1);
  1081. ECMust(')', JSERR_RegExpNoParen);
  1082. break;
  1083. case '.':
  1084. ECConsume();
  1085. break;
  1086. case '[':
  1087. ECConsume();
  1088. this->tempLocationOfSurrogatePair = nullptr;
  1089. this->codePointAtTempLocation = 0;
  1090. this->tempLocationOfRange = next;
  1091. CharacterClassPass0();
  1092. this->tempLocationOfRange = nullptr;
  1093. ECMust(']', JSERR_RegExpNoBracket);
  1094. break;
  1095. case ')':
  1096. case '|':
  1097. Fail(JSERR_RegExpSyntax);
  1098. break;
  1099. case ']':
  1100. case '}':
  1101. NextChar();
  1102. break;
  1103. case '*':
  1104. case '+':
  1105. case '?':
  1106. case '{':
  1107. if (AtQuantifier())
  1108. Fail(JSERR_RegExpBadQuant);
  1109. else
  1110. ECConsume();
  1111. break;
  1112. case 0:
  1113. if (IsEOF())
  1114. // Terminating 0
  1115. Fail(JSERR_RegExpSyntax);
  1116. // else fall-through for embedded 0
  1117. default:
  1118. {
  1119. const EncodedChar* current = next;
  1120. Char c = NextChar();
  1121. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1122. {
  1123. TrackIfSurrogatePair(c, current, (uint32)(next - current));
  1124. }
  1125. // Closing '/' in literals should be caught explicitly
  1126. Assert(!IsLiteral || c != '/');
  1127. }
  1128. break;
  1129. }
  1130. if (clearLocationIfPresent && this->tempLocationOfSurrogatePair != nullptr)
  1131. {
  1132. this->tempLocationOfSurrogatePair = nullptr;
  1133. this->codePointAtTempLocation = 0;
  1134. }
  1135. if (AtQuantifier())
  1136. {
  1137. switch (ECLookahead())
  1138. {
  1139. case '*':
  1140. case '+':
  1141. case '?':
  1142. ECConsume();
  1143. OptNonGreedy();
  1144. break;
  1145. case '{':
  1146. {
  1147. ECConsume();
  1148. CharCount lower = RepeatCount();
  1149. switch (ECLookahead())
  1150. {
  1151. case ',':
  1152. ECConsume();
  1153. if (ECLookahead() == '}')
  1154. {
  1155. ECConsume();
  1156. OptNonGreedy();
  1157. }
  1158. else
  1159. {
  1160. CharCount upper = RepeatCount();
  1161. if (upper < lower)
  1162. Fail(JSERR_RegExpSyntax);
  1163. Assert(ECLookahead() == '}');
  1164. ECConsume();
  1165. OptNonGreedy();
  1166. }
  1167. break;
  1168. case '}':
  1169. ECConsume();
  1170. OptNonGreedy();
  1171. break;
  1172. default:
  1173. Assert(false);
  1174. break;
  1175. }
  1176. break;
  1177. }
  1178. default:
  1179. Assert(false);
  1180. break;
  1181. }
  1182. }
  1183. }
  1184. template <typename P, const bool IsLiteral>
  1185. Node* Parser<P, IsLiteral>::TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1186. {
  1187. PROBE_STACK(scriptContext, Js::Constants::MinStackRegex);
  1188. Node* node = 0;
  1189. bool containsSurrogatePair = false;
  1190. switch (ECLookahead())
  1191. {
  1192. case '^':
  1193. ECConsume();
  1194. return Anew(ctAllocator, SimpleNode, Node::BOL); // No quantifier allowed
  1195. case '$':
  1196. ECConsume();
  1197. return Anew(ctAllocator, SimpleNode, Node::EOL); // No quantifier allowed
  1198. case '\\':
  1199. ECConsume();
  1200. if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
  1201. {
  1202. break; // For quantifier
  1203. }
  1204. else if (AtomEscapePass1(node, deferredCharNode, previousSurrogatePart))
  1205. {
  1206. return node; // No quantifier allowed
  1207. }
  1208. break; // else: fall-through for opt quantifier
  1209. case '(':
  1210. ECConsume();
  1211. switch (ECLookahead())
  1212. {
  1213. case '?':
  1214. switch (ECLookahead(1))
  1215. {
  1216. case '=':
  1217. ECConsume(2); // ?=
  1218. node = DisjunctionPass1();
  1219. Assert(ECLookahead() == ')');
  1220. ECConsume(); // )
  1221. node = Anew(ctAllocator, AssertionNode, false, node);
  1222. break; // As per Annex B, allow this to be quantifiable
  1223. case '!':
  1224. ECConsume(2); // ?!
  1225. node = DisjunctionPass1();
  1226. Assert(ECLookahead() == ')');
  1227. ECConsume(); // )
  1228. node = Anew(ctAllocator, AssertionNode, true, node);
  1229. break; // As per Annex B, allow this to be quantifiable
  1230. case ':':
  1231. ECConsume(2); // ?:
  1232. node = DisjunctionPass1();
  1233. Assert(ECLookahead() == ')');
  1234. ECConsume(); // )
  1235. break; // fall-through for opt quantifier
  1236. default:
  1237. {
  1238. // ? not yet consumed
  1239. int thisGroupId = nextGroupId++;
  1240. node = DisjunctionPass1();
  1241. Assert(ECLookahead() == ')');
  1242. ECConsume(); // )
  1243. node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
  1244. break; // fall-through for opt quantifier
  1245. }
  1246. }
  1247. break;
  1248. default:
  1249. {
  1250. // next char not yet consumed
  1251. int thisGroupId = nextGroupId++;
  1252. node = DisjunctionPass1();
  1253. Assert(ECLookahead() == ')');
  1254. ECConsume(); // )
  1255. node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
  1256. break; // fall-through for opt quantifier
  1257. }
  1258. }
  1259. break;
  1260. case '.':
  1261. {
  1262. ECConsume();
  1263. node = GetNodeWithValidCharacterSet('.');
  1264. break; // fall-through for opt quantifier
  1265. }
  1266. case '[':
  1267. ECConsume();
  1268. if (unicodeFlagPresent)
  1269. {
  1270. containsSurrogatePair = this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->rangeLocation == next;
  1271. }
  1272. node = containsSurrogatePair ? CharacterClassPass1<true>() : CharacterClassPass1<false>();
  1273. Assert(ECLookahead() == ']');
  1274. ECConsume(); // ]
  1275. break; // fall-through for opt quantifier
  1276. #if DBG
  1277. case ')':
  1278. case '|':
  1279. Assert(false);
  1280. break;
  1281. #endif
  1282. case ']':
  1283. // SPEC DEVIATION: This should be syntax error, instead accept as itself
  1284. deferredCharNode->cs[0] = NextChar();
  1285. node = deferredCharNode;
  1286. break; // fall-through for opt quantifier
  1287. #if DBG
  1288. case '*':
  1289. case '+':
  1290. case '?':
  1291. AssertMsg(false, "Allowed only in the escaped form. These should be caught by TermPass0.");
  1292. break;
  1293. #endif
  1294. case 0:
  1295. if (IsEOF())
  1296. // Terminating 0
  1297. Fail(JSERR_RegExpSyntax);
  1298. // else fall-through for embedded 0
  1299. default:
  1300. if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
  1301. {
  1302. break; //For quantifier
  1303. }
  1304. else
  1305. {
  1306. deferredCharNode->cs[0] = NextChar();
  1307. node = deferredCharNode;
  1308. break; // fall-through for opt quantifier
  1309. }
  1310. }
  1311. Assert(node != 0);
  1312. if (AtQuantifier())
  1313. {
  1314. switch (ECLookahead())
  1315. {
  1316. case '*':
  1317. if (node == deferredCharNode)
  1318. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1319. ECConsume();
  1320. return NewLoopNode(0, CharCountFlag, OptNonGreedy(), node);
  1321. case '+':
  1322. if (node == deferredCharNode)
  1323. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1324. ECConsume();
  1325. return NewLoopNode(1, CharCountFlag, OptNonGreedy(), node);
  1326. case '?':
  1327. if (node == deferredCharNode)
  1328. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1329. ECConsume();
  1330. return NewLoopNode(0, 1, OptNonGreedy(), node);
  1331. case '{':
  1332. {
  1333. if (node == deferredCharNode)
  1334. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1335. ECConsume();
  1336. CharCount lower = RepeatCount();
  1337. switch (ECLookahead())
  1338. {
  1339. case ',':
  1340. ECConsume();
  1341. if (ECLookahead() == '}')
  1342. {
  1343. ECConsume();
  1344. return NewLoopNode(lower, CharCountFlag, OptNonGreedy(), node);
  1345. }
  1346. else
  1347. {
  1348. CharCount upper = RepeatCount();
  1349. Assert(lower <= upper);
  1350. Assert(ECLookahead() == '}');
  1351. ECConsume(); // }
  1352. return NewLoopNode(lower, upper, OptNonGreedy(), node);
  1353. }
  1354. case '}':
  1355. ECConsume();
  1356. return NewLoopNode(lower, lower, OptNonGreedy(), node);
  1357. default:
  1358. Assert(false);
  1359. break;
  1360. }
  1361. break;
  1362. }
  1363. default:
  1364. Assert(false);
  1365. break;
  1366. }
  1367. }
  1368. return node;
  1369. }
  1370. #pragma warning(push)
  1371. #pragma warning(disable:4702) // unreachable code
  1372. template <typename P, const bool IsLiteral>
  1373. bool Parser<P, IsLiteral>::AtomEscapePass0()
  1374. {
  1375. EncodedChar ec = ECLookahead();
  1376. if (ec == 0 && IsEOF())
  1377. {
  1378. // Terminating 0
  1379. Fail(JSERR_RegExpSyntax);
  1380. return false;
  1381. }
  1382. else if (standardEncodedChars->IsDigit(ec))
  1383. {
  1384. do
  1385. {
  1386. ECConsume();
  1387. }
  1388. while (standardEncodedChars->IsDigit(ECLookahead())); // terminating 0 is not a digit
  1389. return false;
  1390. }
  1391. else if (ECLookahead() == 'c')
  1392. {
  1393. if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
  1394. ECConsume(2);
  1395. return false;
  1396. }
  1397. else
  1398. {
  1399. const EncodedChar *current = next;
  1400. // An escaped '/' is ok
  1401. Char c = NextChar();
  1402. switch (c)
  1403. {
  1404. case 'b':
  1405. case 'B':
  1406. return true;
  1407. // case 'c': handled as special case above
  1408. case 'x':
  1409. if (ECCanConsume(2) &&
  1410. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1411. standardEncodedChars->IsHex(ECLookahead(1)))
  1412. ECConsume(2);
  1413. break;
  1414. case 'u':
  1415. bool surrogateEncountered = false;
  1416. int lengthOfSurrogate = TryParseExtendedUnicodeEscape(c, surrogateEncountered, true);
  1417. if (lengthOfSurrogate > 0)
  1418. {
  1419. if (surrogateEncountered)
  1420. {
  1421. // If we don't have an allocator, we don't create nodes
  1422. // Asserts in place as extra checks for when we do have an allocator
  1423. Assert(this->ctAllocator == nullptr || this->currentSurrogatePairNode != nullptr);
  1424. Assert(this->ctAllocator == nullptr || current == this->currentSurrogatePairNode->location);
  1425. ECConsume(lengthOfSurrogate);
  1426. }
  1427. //Don't fall through
  1428. break;
  1429. }
  1430. else if (ECCanConsume(4) &&
  1431. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1432. standardEncodedChars->IsHex(ECLookahead(1)) &&
  1433. standardEncodedChars->IsHex(ECLookahead(2)) &&
  1434. standardEncodedChars->IsHex(ECLookahead(3)))
  1435. {
  1436. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1437. {
  1438. codepoint_t value = (standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  1439. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  1440. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  1441. (standardEncodedChars->DigitValue(ECLookahead(3)));
  1442. TrackIfSurrogatePair(value, (next - 1), 5);
  1443. }
  1444. ECConsume(4);
  1445. }
  1446. break;
  1447. }
  1448. // embedded 0 is ok
  1449. return false;
  1450. }
  1451. }
  1452. #pragma warning(pop)
  1453. template <typename P, const bool IsLiteral>
  1454. bool Parser<P, IsLiteral>::AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1455. {
  1456. Assert(!IsEOF()); // checked for terminating 0 in pass 0
  1457. if (standardEncodedChars->IsDigit(ECLookahead()))
  1458. {
  1459. // As per Annex B, allow octal escapes as well as group references, disambiguate based on known
  1460. // number of groups.
  1461. if (ECLookahead() == '0')
  1462. {
  1463. // fall through for octal
  1464. }
  1465. else
  1466. {
  1467. // Could be a group reference, but only if between 1 and 5 digits which resolve to a valid group number
  1468. int n = 0;
  1469. CharCount digits = 0;
  1470. do
  1471. {
  1472. n = n * 10 + (int)standardEncodedChars->DigitValue(ECLookahead(digits));
  1473. digits++;
  1474. }
  1475. while (digits < 5 && ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits)));
  1476. if (n >= numGroups || ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits)))
  1477. {
  1478. if (standardEncodedChars->IsOctal(ECLookahead()))
  1479. {
  1480. // fall through for octal
  1481. }
  1482. else
  1483. {
  1484. // \8 and \9 are identity escapes
  1485. deferredCharNode->cs[0] = Chars<EncodedChar>::CTW(ECLookahead());
  1486. ECConsume();
  1487. node = deferredCharNode;
  1488. return false; // not an assertion
  1489. }
  1490. }
  1491. else
  1492. {
  1493. ECConsume(digits);
  1494. node = Anew(ctAllocator, MatchGroupNode, n);
  1495. return false; // not an assertion
  1496. }
  1497. }
  1498. // Must be between 1 and 3 octal digits
  1499. Assert(standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
  1500. uint n = 0;
  1501. CharCount digits = 0;
  1502. do
  1503. {
  1504. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  1505. if (m > Chars<uint8>::MaxUChar) // Regex octal codes only support single byte (ASCII) characters.
  1506. break;
  1507. n = m;
  1508. ECConsume();
  1509. digits++;
  1510. }
  1511. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
  1512. deferredCharNode->cs[0] = UTC((UChar)n);
  1513. node = deferredCharNode;
  1514. return false; // not an assertion
  1515. }
  1516. else if (ECLookahead() == 'c')
  1517. {
  1518. Char c;
  1519. if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
  1520. {
  1521. c = UTC(Chars<EncodedChar>::CTU(ECLookahead(1)) % 32);
  1522. ECConsume(2);
  1523. }
  1524. else
  1525. {
  1526. // SPEC DEVIATION: For non-letters or EOF, take the leading '\' to be itself, and
  1527. // don't consume the 'c' or letter.
  1528. c = '\\';
  1529. }
  1530. deferredCharNode->cs[0] = c;
  1531. node = deferredCharNode;
  1532. return false; // not an assertion
  1533. }
  1534. else
  1535. {
  1536. Char c = NextChar();
  1537. switch (c)
  1538. {
  1539. case 'b':
  1540. node = Anew(ctAllocator, WordBoundaryNode, false);
  1541. return true; // Is an assertion
  1542. case 'B':
  1543. node = Anew(ctAllocator, WordBoundaryNode, true);
  1544. return true; // Is an assertion
  1545. case 'f':
  1546. c = '\f';
  1547. break; // fall-through for identity escape
  1548. case 'n':
  1549. c = '\n';
  1550. break; // fall-through for identity escape
  1551. case 'r':
  1552. c = '\r';
  1553. break; // fall-through for identity escape
  1554. case 't':
  1555. c = '\t';
  1556. break; // fall-through for identity escape
  1557. case 'v':
  1558. c = '\v';
  1559. break; // fall-through for identity escape
  1560. case 'd':
  1561. {
  1562. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1563. standardChars->SetDigits(ctAllocator, setNode->set);
  1564. node = setNode;
  1565. return false; // not an assertion
  1566. }
  1567. case 'D':
  1568. {
  1569. node = GetNodeWithValidCharacterSet('D');
  1570. return false; // not an assertion
  1571. }
  1572. case 's':
  1573. {
  1574. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1575. standardChars->SetWhitespace(ctAllocator, setNode->set);
  1576. node = setNode;
  1577. return false; // not an assertion
  1578. }
  1579. case 'S':
  1580. {
  1581. node = GetNodeWithValidCharacterSet('S');
  1582. return false; // not an assertion
  1583. }
  1584. case 'w':
  1585. {
  1586. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1587. standardChars->SetWordChars(ctAllocator, setNode->set);
  1588. node = setNode;
  1589. return false; // not an assertion
  1590. }
  1591. case 'W':
  1592. {
  1593. node = GetNodeWithValidCharacterSet('W');
  1594. return false; // not an assertion
  1595. }
  1596. // case 'c': handled as special case above
  1597. case 'x':
  1598. if (ECCanConsume(2) &&
  1599. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1600. standardEncodedChars->IsHex(ECLookahead(1)))
  1601. {
  1602. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  1603. (standardEncodedChars->DigitValue(ECLookahead(1))));
  1604. ECConsume(2);
  1605. // fall-through for identity escape
  1606. }
  1607. // Take to be identity escape if ill-formed as per Annex B
  1608. break;
  1609. case 'u':
  1610. if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
  1611. break;
  1612. else if (ECCanConsume(4) &&
  1613. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1614. standardEncodedChars->IsHex(ECLookahead(1)) &&
  1615. standardEncodedChars->IsHex(ECLookahead(2)) &&
  1616. standardEncodedChars->IsHex(ECLookahead(3)))
  1617. {
  1618. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  1619. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  1620. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  1621. (standardEncodedChars->DigitValue(ECLookahead(3))));
  1622. ECConsume(4);
  1623. // fall-through for identity escape
  1624. }
  1625. // Take to be identity escape if ill-formed as per Annex B
  1626. break;
  1627. default:
  1628. // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok
  1629. break;
  1630. }
  1631. // Must be an identity escape
  1632. deferredCharNode->cs[0] = c;
  1633. node = deferredCharNode;
  1634. return false; // not an assertion
  1635. }
  1636. }
  1637. template <typename P, const bool IsLiteral>
  1638. bool Parser<P, IsLiteral>::SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1639. {
  1640. if (!this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() || !unicodeFlagPresent)
  1641. {
  1642. return false;
  1643. }
  1644. if (this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
  1645. {
  1646. AssertMsg(!this->currentSurrogatePairNode->IsInsideRange(), "Should not be calling this pass if we are currently inside a range.");
  1647. char16 lower, upper;
  1648. uint tableIndex = 0, actualHigh = 0;
  1649. codepoint_t equivClass[CaseInsensitive::EquivClassSize];
  1650. if (caseInsensitiveFlagPresent && CaseInsensitive::RangeToEquivClass(tableIndex, this->currentSurrogatePairNode->value, this->currentSurrogatePairNode->value, actualHigh, equivClass))
  1651. {
  1652. Node *equivNode[CaseInsensitive::EquivClassSize];
  1653. int indexForNextNode = 0;
  1654. for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
  1655. {
  1656. bool alreadyAdded = false;
  1657. for (int j = 0; j < i; j++)
  1658. {
  1659. if (equivClass[i] == equivClass[j])
  1660. {
  1661. alreadyAdded = true;
  1662. break;
  1663. }
  1664. }
  1665. if (!alreadyAdded)
  1666. {
  1667. if (Js::NumberUtilities::IsInSupplementaryPlane(equivClass[i]))
  1668. {
  1669. Js::NumberUtilities::CodePointAsSurrogatePair(equivClass[i], &lower, &upper);
  1670. equivNode[indexForNextNode] = CreateSurrogatePairAtom(lower, upper);
  1671. }
  1672. else
  1673. {
  1674. equivNode[indexForNextNode] = Anew(ctAllocator, MatchCharNode, (Char)equivClass[i]);
  1675. }
  1676. indexForNextNode ++;
  1677. }
  1678. }
  1679. Assert(indexForNextNode > 0);
  1680. if (indexForNextNode == 1)
  1681. {
  1682. node = equivNode[0];
  1683. }
  1684. else
  1685. {
  1686. AltNode *altNode = Anew(ctAllocator, AltNode, equivNode[0], nullptr);
  1687. AltNode *altNodeTail = altNode;
  1688. for (int i = 1; i < indexForNextNode; i++)
  1689. {
  1690. altNodeTail->tail = Anew(ctAllocator, AltNode, equivNode[i], nullptr);
  1691. altNodeTail = altNodeTail->tail;
  1692. }
  1693. node = altNode;
  1694. }
  1695. }
  1696. else
  1697. {
  1698. Js::NumberUtilities::CodePointAsSurrogatePair(this->currentSurrogatePairNode->value, &lower, &upper);
  1699. node = CreateSurrogatePairAtom(lower, upper);
  1700. }
  1701. previousSurrogatePart = false;
  1702. Assert(ECCanConsume(this->currentSurrogatePairNode->length));
  1703. ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
  1704. RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
  1705. this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
  1706. return true;
  1707. }
  1708. return false;
  1709. }
  1710. //
  1711. // Classes
  1712. //
  1713. template <typename P, const bool IsLiteral>
  1714. bool Parser<P, IsLiteral>::AtSecondSingletonClassAtom()
  1715. {
  1716. Assert(ECLookahead() == '-');
  1717. if (ECLookahead(1) == '\\')
  1718. {
  1719. switch (ECLookahead(2))
  1720. {
  1721. case 'd':
  1722. case 'D':
  1723. case 's':
  1724. case 'S':
  1725. case 'w':
  1726. case 'W':
  1727. // These all denote non-singleton sets
  1728. return false;
  1729. default:
  1730. // fall-through for singleton
  1731. break;
  1732. }
  1733. }
  1734. return true;
  1735. }
  1736. template <typename P, const bool IsLiteral>
  1737. void Parser<P, IsLiteral>::CharacterClassPass0()
  1738. {
  1739. // Could be terminating 0
  1740. if (ECLookahead() == '^')
  1741. ECConsume();
  1742. EncodedChar nextChar = ECLookahead();
  1743. const EncodedChar* current;
  1744. codepoint_t lastCodepoint = INVALID_CODEPOINT;
  1745. codepoint_t pendingRangeStart = INVALID_CODEPOINT;
  1746. codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
  1747. bool previousSurrogatePart = false;
  1748. while(nextChar != ']')
  1749. {
  1750. current = next;
  1751. if (nextChar == '\0' && IsEOF())
  1752. {
  1753. // Report as unclosed '['
  1754. Fail(JSERR_RegExpNoBracket);
  1755. return;
  1756. } // Otherwise embedded '\0' is ok
  1757. else if (nextChar == '\\')
  1758. {
  1759. // Consume, as classescapepass0 expects for it to be consumed
  1760. Char outChar = NextChar();
  1761. // If previousSurrogatePart = true upon leaving this method, then we are going to pass through here twice
  1762. // This is because \u{} escape sequence was encountered that is actually 2 characters, the second time we will pass consuming entire character
  1763. if (ClassEscapePass0(outChar, previousSurrogatePart))
  1764. {
  1765. lastCodepoint = outChar;
  1766. }
  1767. else
  1768. {
  1769. // Last codepoint isn't a singleton, so no codepoint tracking for the sake of ranges is needed.
  1770. lastCodepoint = INVALID_CODEPOINT;
  1771. // Unless we have a possible range end, cancel our range tracking.
  1772. if (pendingRangeEnd == INVALID_CODEPOINT)
  1773. {
  1774. pendingRangeStart = INVALID_CODEPOINT;
  1775. }
  1776. }
  1777. }
  1778. else if (nextChar == '-')
  1779. {
  1780. if (pendingRangeStart != INVALID_CODEPOINT || lastCodepoint == INVALID_CODEPOINT)
  1781. {
  1782. // '-' is the upper part of the range, with pendingRangeStart codepoint is the lower. Set lastCodePoint to '-' to check at the end of the while statement
  1783. // OR
  1784. // '-' is just a char, consume it and set as last char
  1785. lastCodepoint = NextChar();
  1786. }
  1787. else
  1788. {
  1789. pendingRangeStart = this->next == this->positionAfterLastSurrogate
  1790. ? this->valueOfLastSurrogate
  1791. : lastCodepoint;
  1792. lastCodepoint = INVALID_CODEPOINT;
  1793. NextChar();
  1794. }
  1795. // If we have a pattern of the form [\ud800-\udfff], we need this to be interpreted as a range.
  1796. // In order to achieve this, the two variables that we use to track surrogate pairs, namely
  1797. // tempLocationOfSurrogatePair and previousSurrogatePart, need to be in a certain state.
  1798. //
  1799. // We need to reset tempLocationOfSurrogatePair as it points to the first Unicode escape (\ud800)
  1800. // when we're here. We need to clear it in order not to have a surrogate pair when we process the
  1801. // second escape (\udfff).
  1802. //
  1803. // previousSurrogatePart is used when we have a code point in the \u{...} extended format and the
  1804. // character is in a supplementary plane. However, there is no need to change its value here. When
  1805. // such an escape sequence is encountered, the first call to ClassEscapePass0() sets the variable
  1806. // to true, but it rewinds the input back to the beginning of the escape sequence. The next
  1807. // iteration of the loop here will again call ClassEscape0() with the same character and the
  1808. // variable will this time be set to false. Therefore, the variable will always be false here.
  1809. tempLocationOfSurrogatePair = nullptr;
  1810. Assert(!previousSurrogatePart);
  1811. }
  1812. else
  1813. {
  1814. lastCodepoint = NextChar();
  1815. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1816. {
  1817. TrackIfSurrogatePair(lastCodepoint, current, (uint32)(next - current));
  1818. }
  1819. }
  1820. if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1821. {
  1822. // This is much easier to handle.
  1823. if (pendingRangeStart != INVALID_CODEPOINT && lastCodepoint != INVALID_CODEPOINT)
  1824. {
  1825. Assert(pendingRangeStart < 0x10000);
  1826. Assert(pendingRangeEnd == INVALID_CODEPOINT);
  1827. if (pendingRangeStart > lastCodepoint)
  1828. {
  1829. Fail(JSERR_RegExpBadRange);
  1830. }
  1831. pendingRangeStart = lastCodepoint = INVALID_CODEPOINT;
  1832. }
  1833. }
  1834. // We have a candidate for range end, and we have a range start.
  1835. else if (pendingRangeStart != INVALID_CODEPOINT && (lastCodepoint != INVALID_CODEPOINT || pendingRangeEnd != INVALID_CODEPOINT))
  1836. {
  1837. // The following will be true at the end of each surrogate pair parse.
  1838. // Note, the escape sequence \u{} is a two time parse, so this will be true on the second time around.
  1839. if (this->next == this->positionAfterLastSurrogate)
  1840. {
  1841. lastCodepoint = this->valueOfLastSurrogate;
  1842. Assert(!previousSurrogatePart);
  1843. pendingRangeEnd = lastCodepoint;
  1844. lastCodepoint = INVALID_CODEPOINT;
  1845. }
  1846. // If we the next character is the end of range ']', then we can't have a surrogate pair.
  1847. // The current character is the range end, if we don't already have a candidate.
  1848. else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
  1849. {
  1850. pendingRangeEnd = lastCodepoint;
  1851. }
  1852. //If we get here, and pendingRangeEnd is set. Then one of the above has caused it to be set, or the previous iteration of the loop.
  1853. if (pendingRangeEnd != INVALID_CODEPOINT)
  1854. {
  1855. char16 leftSingleChar, rightSingleChar, ignore;
  1856. if (pendingRangeStart >= 0x10000)
  1857. {
  1858. Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeStart, &ignore, &leftSingleChar);
  1859. }
  1860. else
  1861. {
  1862. leftSingleChar = (char16)pendingRangeStart;
  1863. }
  1864. if (pendingRangeEnd >= 0x10000)
  1865. {
  1866. Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeEnd, &rightSingleChar, &ignore);
  1867. }
  1868. else
  1869. {
  1870. rightSingleChar = (char16)pendingRangeEnd;
  1871. }
  1872. // Here it is a bit tricky, we don't know if we have a unicode option specified.
  1873. // If it is, then \ud800\udc00 - \ud800\udc01 is valid, otherwise invalid.
  1874. if (pendingRangeStart < 0x10000 && pendingRangeEnd < 0x10000 && pendingRangeStart > pendingRangeEnd)
  1875. {
  1876. Fail(JSERR_RegExpBadRange);
  1877. }
  1878. else
  1879. {
  1880. if(leftSingleChar > rightSingleChar)
  1881. {
  1882. DeferredFailIfNotUnicode(JSERR_RegExpBadRange);
  1883. }
  1884. if (pendingRangeStart > pendingRangeEnd)
  1885. {
  1886. DeferredFailIfUnicode(JSERR_RegExpBadRange);
  1887. }
  1888. }
  1889. pendingRangeStart = pendingRangeEnd = INVALID_CODEPOINT;
  1890. }
  1891. // The current char < 0x10000 is a candidate for the range end, but we need to iterate one more time.
  1892. else
  1893. {
  1894. pendingRangeEnd = lastCodepoint;
  1895. }
  1896. }
  1897. nextChar = ECLookahead();
  1898. }
  1899. // We should never have a pendingRangeEnd set when we exit the loop
  1900. Assert(pendingRangeEnd == INVALID_CODEPOINT);
  1901. }
  1902. template <typename P, const bool IsLiteral>
  1903. template <bool containsSurrogates>
  1904. Node* Parser<P, IsLiteral>::CharacterClassPass1()
  1905. {
  1906. Assert(containsSurrogates ? unicodeFlagPresent : true);
  1907. CharSet<codepoint_t> codePointSet;
  1908. MatchSetNode deferredSetNode(false, false);
  1909. MatchCharNode deferredCharNode(0);
  1910. bool isNegation = false;
  1911. if (ECLookahead() == '^')
  1912. {
  1913. isNegation = true;
  1914. ECConsume();
  1915. }
  1916. // We aren't expecting any terminating null characters, only embedded ones that should treated as valid characters.
  1917. // CharacterClassPass0 should have taken care of terminating null.
  1918. codepoint_t pendingCodePoint = INVALID_CODEPOINT;
  1919. codepoint_t pendingRangeStart = INVALID_CODEPOINT;
  1920. EncodedChar nextChar = ECLookahead();
  1921. bool previousWasASurrogate = false;
  1922. while(nextChar != ']')
  1923. {
  1924. codepoint_t codePointToSet = INVALID_CODEPOINT;
  1925. // Consume ahead of time if we have two backslashes, both cases below (previously Tracked surrogate pair, and ClassEscapePass1) assume it is.
  1926. if (nextChar == '\\')
  1927. {
  1928. ECConsume();
  1929. }
  1930. // These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
  1931. if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
  1932. {
  1933. codePointToSet = pendingCodePoint;
  1934. pendingCodePoint = this->currentSurrogatePairNode->value;
  1935. Assert(ECCanConsume(this->currentSurrogatePairNode->length));
  1936. ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
  1937. RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
  1938. this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
  1939. }
  1940. else if (nextChar == '\\')
  1941. {
  1942. Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
  1943. if (returnedNode->tag == Node::MatchSet)
  1944. {
  1945. codePointToSet = pendingCodePoint;
  1946. pendingCodePoint = INVALID_CODEPOINT;
  1947. if (pendingRangeStart != INVALID_CODEPOINT)
  1948. {
  1949. codePointSet.Set(ctAllocator, '-');
  1950. }
  1951. pendingRangeStart = INVALID_CODEPOINT;
  1952. codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
  1953. }
  1954. else
  1955. {
  1956. // Just a character
  1957. codePointToSet = pendingCodePoint;
  1958. pendingCodePoint = deferredCharNode.cs[0];
  1959. }
  1960. }
  1961. else if (nextChar == '-')
  1962. {
  1963. if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
  1964. {
  1965. // - is just a char, or end of a range.
  1966. codePointToSet = pendingCodePoint;
  1967. pendingCodePoint = '-';
  1968. ECConsume();
  1969. }
  1970. else
  1971. {
  1972. pendingRangeStart = pendingCodePoint;
  1973. ECConsume();
  1974. }
  1975. }
  1976. else
  1977. {
  1978. // Just a character, consume it
  1979. codePointToSet = pendingCodePoint;
  1980. pendingCodePoint = NextChar();
  1981. }
  1982. if (codePointToSet != INVALID_CODEPOINT)
  1983. {
  1984. if (pendingRangeStart != INVALID_CODEPOINT)
  1985. {
  1986. if (pendingRangeStart > pendingCodePoint)
  1987. {
  1988. //We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
  1989. //This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
  1990. Assert(!unicodeFlagPresent);
  1991. Fail(JSERR_RegExpBadRange);
  1992. }
  1993. codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
  1994. pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
  1995. }
  1996. else
  1997. {
  1998. codePointSet.Set(ctAllocator, codePointToSet);
  1999. }
  2000. }
  2001. nextChar = ECLookahead();
  2002. }
  2003. if (pendingCodePoint != INVALID_CODEPOINT)
  2004. {
  2005. codePointSet.Set(ctAllocator, pendingCodePoint);
  2006. }
  2007. // At this point, we have a complete set of codepoints representing the range.
  2008. // Before performing translation of any kind, we need to do some case filling.
  2009. // At the point of this comment, there are no case mappings going cross-plane between simple
  2010. // characters (< 0x10000) and supplementary characters (>= 0x10000)
  2011. // However, it might still be the case, and this has to be handled.
  2012. // On the other hand, we don't want to prevent optimizations that expect non-casefolded sets from happening.
  2013. // At least for simple characters.
  2014. // The simple case, is when the unicode flag isn't specified, we can go ahead and return the simple set.
  2015. // Negations and case mappings will be handled later.
  2016. if (!unicodeFlagPresent)
  2017. {
  2018. Assert(codePointSet.SimpleCharCount() == codePointSet.Count());
  2019. MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
  2020. codePointSet.CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
  2021. return simpleToReturn;
  2022. }
  2023. // Everything past here must be under the flag
  2024. Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2025. if (codePointSet.IsEmpty())
  2026. {
  2027. return Anew(ctAllocator, MatchSetNode, false, false);
  2028. }
  2029. Node* prefixNode = nullptr;
  2030. Node* suffixNode = nullptr;
  2031. CharSet<codepoint_t> *toUseForTranslation = &codePointSet;
  2032. // If a singleton, return a simple character
  2033. bool isSingleton = !this->caseInsensitiveFlagPresent && !isNegation && codePointSet.IsSingleton();
  2034. if (isSingleton)
  2035. {
  2036. codepoint_t singleton = codePointSet.Singleton();
  2037. Node* toReturn = nullptr;
  2038. if (singleton < 0x10000)
  2039. {
  2040. toReturn = Anew(ctAllocator, MatchCharNode, (char16)singleton);
  2041. }
  2042. else
  2043. {
  2044. Assert(unicodeFlagPresent);
  2045. char16 lowerSurrogate, upperSurrogate;
  2046. Js::NumberUtilities::CodePointAsSurrogatePair(singleton, &lowerSurrogate, &upperSurrogate);
  2047. toReturn = CreateSurrogatePairAtom(lowerSurrogate, upperSurrogate);
  2048. }
  2049. codePointSet.Clear(ctAllocator);
  2050. return toReturn;
  2051. }
  2052. if (!this->caseInsensitiveFlagPresent)
  2053. {
  2054. // If negation, we want to complement the simple chars.
  2055. // When a set is negated, optimizations skip checking if applicable, so we can go ahead and negate it here.
  2056. CharSet<codepoint_t> negatedSet;
  2057. if (isNegation)
  2058. {
  2059. // Complement all characters, and use it as the set toTranslate
  2060. codePointSet.ToComplement(ctAllocator, negatedSet);
  2061. }
  2062. toUseForTranslation = isNegation ? &negatedSet : &codePointSet;
  2063. if (isNegation)
  2064. {
  2065. // Clear this, as we will no longer need this.
  2066. codePointSet.FreeBody(ctAllocator);
  2067. }
  2068. }
  2069. else
  2070. {
  2071. CharSet<codepoint_t> caseEquivalent;
  2072. codePointSet.ToEquivClass(ctAllocator, caseEquivalent);
  2073. // Equiv set can't have a reduced count of chars
  2074. Assert(caseEquivalent.Count() >= codePointSet.Count());
  2075. // Here we have a regex that has both case insensitive and unicode options.
  2076. // The range might also be negated. If it is negated, we can go ahead and negate
  2077. // the entire set as well as fill in cases, as optimizations wouldn't kick in anyways.
  2078. if (isNegation)
  2079. {
  2080. codePointSet.Clear(ctAllocator);
  2081. caseEquivalent.ToComplement(ctAllocator, codePointSet);
  2082. caseEquivalent.FreeBody(ctAllocator);
  2083. }
  2084. else
  2085. {
  2086. codePointSet.CloneFrom(ctAllocator, caseEquivalent);
  2087. }
  2088. Assert(toUseForTranslation == &codePointSet);
  2089. }
  2090. uint totalCodePointsCount = toUseForTranslation->Count();
  2091. uint simpleCharsCount = toUseForTranslation->SimpleCharCount();
  2092. if (totalCodePointsCount == simpleCharsCount)
  2093. {
  2094. MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
  2095. toUseForTranslation->CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
  2096. return simpleToReturn;
  2097. }
  2098. if (simpleCharsCount > 0)
  2099. {
  2100. if (!toUseForTranslation->ContainSurrogateCodeUnits())
  2101. {
  2102. MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
  2103. toUseForTranslation->CloneSimpleCharsTo(ctAllocator, node->set);
  2104. prefixNode = node;
  2105. }
  2106. else
  2107. {
  2108. MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
  2109. toUseForTranslation->CloneNonSurrogateCodeUnitsTo(ctAllocator, node->set);
  2110. prefixNode = node;
  2111. node = Anew(ctAllocator, MatchSetNode, false, false);
  2112. toUseForTranslation->CloneSurrogateCodeUnitsTo(ctAllocator, node->set);
  2113. suffixNode = node;
  2114. }
  2115. }
  2116. Assert(unicodeFlagPresent);
  2117. AltNode *headToReturn = prefixNode == nullptr ? nullptr : Anew(ctAllocator, AltNode, prefixNode, nullptr);
  2118. AltNode *currentTail = headToReturn;
  2119. codepoint_t charRangeSearchIndex = 0x10000, lowerCharOfRange = 0, upperCharOfRange = 0;
  2120. while (toUseForTranslation->GetNextRange(charRangeSearchIndex, &lowerCharOfRange, &upperCharOfRange))
  2121. {
  2122. if (lowerCharOfRange == upperCharOfRange)
  2123. {
  2124. currentTail = this->AppendSurrogatePairToDisjunction(lowerCharOfRange, currentTail);
  2125. }
  2126. else
  2127. {
  2128. currentTail = this->AppendSurrogateRangeToDisjunction(lowerCharOfRange, upperCharOfRange, currentTail);
  2129. }
  2130. if (headToReturn == nullptr)
  2131. {
  2132. headToReturn = currentTail;
  2133. }
  2134. AnalysisAssert(currentTail != nullptr);
  2135. while (currentTail->tail != nullptr)
  2136. {
  2137. currentTail = currentTail->tail;
  2138. }
  2139. charRangeSearchIndex = upperCharOfRange + 1;
  2140. }
  2141. if (suffixNode != nullptr)
  2142. {
  2143. currentTail->tail = Anew(ctAllocator, AltNode, suffixNode, nullptr);
  2144. }
  2145. toUseForTranslation->Clear(ctAllocator);
  2146. if (headToReturn != nullptr && headToReturn->tail == nullptr)
  2147. {
  2148. return headToReturn->head;
  2149. }
  2150. return headToReturn;
  2151. }
  2152. #pragma warning(push)
  2153. #pragma warning(disable:4702) // unreachable code
  2154. template <typename P, const bool IsLiteral>
  2155. bool Parser<P, IsLiteral>::ClassEscapePass0(Char& singleton, bool& previousSurrogatePart)
  2156. {
  2157. // Could be terminating 0
  2158. EncodedChar ec = ECLookahead();
  2159. if (ec == 0 && IsEOF())
  2160. {
  2161. Fail(JSERR_RegExpSyntax);
  2162. return false;
  2163. }
  2164. else if (standardEncodedChars->IsOctal(ec))
  2165. {
  2166. uint n = 0;
  2167. CharCount digits = 0;
  2168. do
  2169. {
  2170. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  2171. if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
  2172. break;
  2173. n = m;
  2174. ECConsume();
  2175. digits++;
  2176. }
  2177. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
  2178. singleton = UTC((UChar)n);
  2179. // Clear possible pair
  2180. this->tempLocationOfSurrogatePair = nullptr;
  2181. return true;
  2182. }
  2183. else
  2184. {
  2185. const EncodedChar* location = this->tempLocationOfSurrogatePair;
  2186. // Clear it for now, otherwise to many branches to clear it on.
  2187. this->tempLocationOfSurrogatePair = nullptr;
  2188. // An escaped '/' is ok
  2189. Char c = NextChar();
  2190. switch (c)
  2191. {
  2192. case 'b':
  2193. singleton = '\b';
  2194. return true;
  2195. case 'f':
  2196. singleton = '\f';
  2197. return true;
  2198. case 'n':
  2199. singleton = '\n';
  2200. return true;
  2201. case 'r':
  2202. singleton = '\r';
  2203. return true;
  2204. case 't':
  2205. singleton = '\t';
  2206. return true;
  2207. case 'v':
  2208. singleton = '\v';
  2209. return true;
  2210. case 'd':
  2211. case 'D':
  2212. case 's':
  2213. case 'S':
  2214. case 'w':
  2215. case 'W':
  2216. return false;
  2217. case 'c':
  2218. if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
  2219. {
  2220. singleton = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
  2221. ECConsume();
  2222. }
  2223. else
  2224. {
  2225. if (!IsEOF())
  2226. {
  2227. EncodedChar ec = ECLookahead();
  2228. switch (ec)
  2229. {
  2230. case '-':
  2231. case ']':
  2232. singleton = c;
  2233. break;
  2234. default:
  2235. singleton = UTC(Chars<EncodedChar>::CTU(ec) % 32);
  2236. ECConsume();
  2237. break;
  2238. }
  2239. }
  2240. else
  2241. singleton = c;
  2242. }
  2243. return true;
  2244. case 'x':
  2245. if (ECCanConsume(2) &&
  2246. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2247. standardEncodedChars->IsHex(ECLookahead(1)))
  2248. {
  2249. singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  2250. (standardEncodedChars->DigitValue(ECLookahead(1))));
  2251. ECConsume(2);
  2252. }
  2253. else
  2254. singleton = c;
  2255. return true;
  2256. case 'u':
  2257. this->tempLocationOfSurrogatePair = location;
  2258. if (this->TryParseExtendedUnicodeEscape(singleton, previousSurrogatePart, true) > 0)
  2259. return true;
  2260. else if (ECCanConsume(4) &&
  2261. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2262. standardEncodedChars->IsHex(ECLookahead(1)) &&
  2263. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2264. standardEncodedChars->IsHex(ECLookahead(3)))
  2265. {
  2266. singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  2267. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  2268. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  2269. (standardEncodedChars->DigitValue(ECLookahead(3))));
  2270. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2271. {
  2272. // Current location
  2273. TrackIfSurrogatePair(singleton, (next - 1), 5);
  2274. }
  2275. // The above if statement, if true, will clear tempLocationOfSurrogatePair if needs to.
  2276. ECConsume(4);
  2277. }
  2278. else
  2279. singleton = c;
  2280. return true;
  2281. default:
  2282. // embedded 0 is ok
  2283. singleton = c;
  2284. return true;
  2285. }
  2286. }
  2287. }
  2288. #pragma warning(pop)
  2289. template <typename P, const bool IsLiteral>
  2290. Node* Parser<P, IsLiteral>::ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart)
  2291. {
  2292. // Checked for terminating 0 is pass 0
  2293. Assert(!IsEOF());
  2294. if (standardEncodedChars->IsOctal(ECLookahead()))
  2295. {
  2296. // As per Annex B, allow octal escapes instead of just \0 (and \8 and \9 are identity escapes).
  2297. // Must be between 1 and 3 octal digits.
  2298. uint n = 0;
  2299. CharCount digits = 0;
  2300. do
  2301. {
  2302. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  2303. if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
  2304. break;
  2305. n = m;
  2306. ECConsume();
  2307. digits++;
  2308. }
  2309. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
  2310. deferredCharNode->cs[0] = UTC((UChar)n);
  2311. return deferredCharNode;
  2312. }
  2313. else
  2314. {
  2315. Char c = NextChar();
  2316. switch (c)
  2317. {
  2318. case 'b':
  2319. c = '\b';
  2320. break; // fall-through for identity escape
  2321. case 'f':
  2322. c = '\f';
  2323. break; // fall-through for identity escape
  2324. case 'n':
  2325. c = '\n';
  2326. break; // fall-through for identity escape
  2327. case 'r':
  2328. c = '\r';
  2329. break; // fall-through for identity escape
  2330. case 't':
  2331. c = '\t';
  2332. break; // fall-through for identity escape
  2333. case 'v':
  2334. c = '\v';
  2335. break; // fall-through for identity escape
  2336. case 'd':
  2337. standardChars->SetDigits(ctAllocator, deferredSetNode->set);
  2338. return deferredSetNode;
  2339. case 'D':
  2340. standardChars->SetNonDigits(ctAllocator, deferredSetNode->set);
  2341. return deferredSetNode;
  2342. case 's':
  2343. standardChars->SetWhitespace(ctAllocator, deferredSetNode->set);
  2344. return deferredSetNode;
  2345. case 'S':
  2346. standardChars->SetNonWhitespace(ctAllocator, deferredSetNode->set);
  2347. return deferredSetNode;
  2348. case 'w':
  2349. standardChars->SetWordChars(ctAllocator, deferredSetNode->set);
  2350. return deferredSetNode;
  2351. case 'W':
  2352. standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
  2353. return deferredSetNode;
  2354. case 'c':
  2355. if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
  2356. {
  2357. c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
  2358. ECConsume();
  2359. // fall-through for identity escape
  2360. }
  2361. else
  2362. {
  2363. // SPEC DEVIATION: For non-letters, still take lower 5 bits, e.g. [\c1] == [\x11].
  2364. // However, '-', ']', and EOF make the \c just a 'c'.
  2365. if (!IsEOF())
  2366. {
  2367. EncodedChar ec = ECLookahead();
  2368. switch (ec)
  2369. {
  2370. case '-':
  2371. case ']':
  2372. // fall-through for identity escape with 'c'
  2373. break;
  2374. default:
  2375. c = UTC(Chars<EncodedChar>::CTU(ec) % 32);
  2376. ECConsume();
  2377. // fall-through for identity escape
  2378. break;
  2379. }
  2380. }
  2381. // else: fall-through for identity escape with 'c'
  2382. }
  2383. break;
  2384. case 'x':
  2385. if (ECCanConsume(2) &&
  2386. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2387. standardEncodedChars->IsHex(ECLookahead(1)))
  2388. {
  2389. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  2390. (standardEncodedChars->DigitValue(ECLookahead(1))));
  2391. ECConsume(2);
  2392. // fall-through for identity escape
  2393. }
  2394. // Take to be identity escape if ill-formed as per Annex B
  2395. break;
  2396. case 'u':
  2397. if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
  2398. break;
  2399. else if (ECCanConsume(4) &&
  2400. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2401. standardEncodedChars->IsHex(ECLookahead(1)) &&
  2402. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2403. standardEncodedChars->IsHex(ECLookahead(3)))
  2404. {
  2405. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  2406. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  2407. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  2408. (standardEncodedChars->DigitValue(ECLookahead(3))));
  2409. ECConsume(4);
  2410. // fall-through for identity escape
  2411. }
  2412. // Take to be identity escape if ill-formed as per Annex B.
  2413. break;
  2414. default:
  2415. // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok.
  2416. break;
  2417. }
  2418. // Must be an identity escape
  2419. deferredCharNode->cs[0] = c;
  2420. return deferredCharNode;
  2421. }
  2422. }
  2423. //
  2424. // Options
  2425. //
  2426. template <typename P, const bool IsLiteral>
  2427. void Parser<P, IsLiteral>::Options(RegexFlags& flags)
  2428. {
  2429. while (true)
  2430. {
  2431. // Could be terminating 0
  2432. EncodedChar ec = ECLookahead();
  2433. CharCount consume;
  2434. Char c;
  2435. if (ec == 0)
  2436. // Embedded 0 not valid
  2437. return;
  2438. else if (IsLiteral &&
  2439. ec == '\\' &&
  2440. ECCanConsume(6) &&
  2441. ECLookahead(1) == 'u' &&
  2442. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2443. standardEncodedChars->IsHex(ECLookahead(3)) &&
  2444. standardEncodedChars->IsHex(ECLookahead(4)) &&
  2445. standardEncodedChars->IsHex(ECLookahead(5)))
  2446. {
  2447. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2448. {
  2449. Fail(JSERR_RegExpSyntax);
  2450. return;
  2451. }
  2452. else
  2453. {
  2454. uint32 n = (standardEncodedChars->DigitValue(ECLookahead(2)) << 12) |
  2455. (standardEncodedChars->DigitValue(ECLookahead(3)) << 8) |
  2456. (standardEncodedChars->DigitValue(ECLookahead(4)) << 4) |
  2457. (standardEncodedChars->DigitValue(ECLookahead(5)));
  2458. c = UTC(n);
  2459. consume = 6;
  2460. }
  2461. }
  2462. else
  2463. {
  2464. c = Chars<EncodedChar>::CTW(ec);
  2465. consume = 1;
  2466. }
  2467. switch (c) {
  2468. case 'i':
  2469. if ((flags & IgnoreCaseRegexFlag) != 0)
  2470. {
  2471. Fail(JSERR_RegExpSyntax);
  2472. }
  2473. flags = (RegexFlags)(flags | IgnoreCaseRegexFlag);
  2474. break;
  2475. case 'g':
  2476. if ((flags & GlobalRegexFlag) != 0)
  2477. {
  2478. Fail(JSERR_RegExpSyntax);
  2479. }
  2480. flags = (RegexFlags)(flags | GlobalRegexFlag);
  2481. break;
  2482. case 'm':
  2483. if ((flags & MultilineRegexFlag) != 0)
  2484. {
  2485. Fail(JSERR_RegExpSyntax);
  2486. }
  2487. flags = (RegexFlags)(flags | MultilineRegexFlag);
  2488. break;
  2489. case 'u':
  2490. // If we don't have unicode enabled, fall through to default
  2491. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2492. {
  2493. if ((flags & UnicodeRegexFlag) != 0)
  2494. {
  2495. Fail(JSERR_RegExpSyntax);
  2496. }
  2497. flags = (RegexFlags)(flags | UnicodeRegexFlag);
  2498. // For telemetry
  2499. CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(UnicodeRegexFlagCount, scriptContext);
  2500. break;
  2501. }
  2502. case 'y':
  2503. if (scriptContext->GetConfig()->IsES6RegExStickyEnabled())
  2504. {
  2505. if ((flags & StickyRegexFlag) != 0)
  2506. {
  2507. Fail(JSERR_RegExpSyntax);
  2508. }
  2509. flags = (RegexFlags)(flags | StickyRegexFlag);
  2510. // For telemetry
  2511. CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(StickyRegexFlagCount, scriptContext);
  2512. break;
  2513. }
  2514. default:
  2515. if (standardChars->IsWord(c))
  2516. {
  2517. // Outer context could never parse this character. Signal the syntax error as
  2518. // being part of the regex.
  2519. Fail(JSERR_RegExpSyntax);
  2520. }
  2521. return;
  2522. }
  2523. ECConsume(consume);
  2524. }
  2525. }
  2526. //
  2527. // Entry points
  2528. //
  2529. template <typename P, const bool IsLiteral>
  2530. Node* Parser<P, IsLiteral>::ParseDynamic
  2531. ( const EncodedChar* body
  2532. , const EncodedChar* bodyLim
  2533. , const EncodedChar* opts
  2534. , const EncodedChar* optsLim
  2535. , RegexFlags& flags )
  2536. {
  2537. Assert(!IsLiteral);
  2538. Assert(body != 0);
  2539. Assert(bodyLim >= body && *bodyLim == 0);
  2540. Assert(opts == 0 || (optsLim >= opts && *optsLim == 0));
  2541. // Body, pass 0
  2542. SetPosition(body, bodyLim, true);
  2543. PatternPass0();
  2544. if (!IsEOF())
  2545. Fail(JSERR_RegExpSyntax);
  2546. // Options
  2547. if (opts != 0)
  2548. {
  2549. SetPosition(opts, optsLim, false);
  2550. Options(flags);
  2551. if (!IsEOF())
  2552. Fail(JSERR_RegExpSyntax);
  2553. this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2554. this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2555. Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2556. }
  2557. else
  2558. {
  2559. this->unicodeFlagPresent = false;
  2560. this->caseInsensitiveFlagPresent = false;
  2561. }
  2562. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2563. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2564. {
  2565. throw ParseError(*deferredIfNotUnicodeError);
  2566. }
  2567. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2568. {
  2569. throw ParseError(*deferredIfUnicodeError);
  2570. }
  2571. this->currentSurrogatePairNode = this->surrogatePairList;
  2572. // Body, pass 1
  2573. SetPosition(body, bodyLim, true);
  2574. Node* root = PatternPass1();
  2575. Assert(IsEOF());
  2576. return root;
  2577. }
  2578. template <typename P, const bool IsLiteral>
  2579. Node* Parser<P, IsLiteral>::ParseLiteral
  2580. ( const EncodedChar* input
  2581. , const EncodedChar* inputLim
  2582. , CharCount& outBodyEncodedChars
  2583. , CharCount& outTotalEncodedChars
  2584. , CharCount& outBodyChars
  2585. , CharCount& outTotalChars
  2586. , RegexFlags& flags )
  2587. {
  2588. Assert(IsLiteral);
  2589. Assert(input != 0);
  2590. Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
  2591. // To handle surrogate pairs properly under unicode option, we will collect information on location of the pairs
  2592. // during pass 0, regardless if the option is present. (We aren't able to get it at that time)
  2593. // During pass 1, we will use that information to correctly create appropriate nodes.
  2594. // Body, pass 0
  2595. SetPosition(input, inputLim, true);
  2596. PatternPass0();
  2597. outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2598. outBodyChars = Pos();
  2599. // Options are needed for the next pass
  2600. ECMust('/', ERRnoSlash);
  2601. Options(flags);
  2602. this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2603. this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2604. Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2605. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2606. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2607. {
  2608. throw ParseError(*deferredIfNotUnicodeError);
  2609. }
  2610. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2611. {
  2612. throw ParseError(*deferredIfUnicodeError);
  2613. }
  2614. // Used below to proceed to the end of the regex
  2615. const EncodedChar *pastOptions = next;
  2616. this->currentSurrogatePairNode = this->surrogatePairList;
  2617. // Body, pass 1
  2618. SetPosition(input, inputLim, true);
  2619. Node* root = PatternPass1();
  2620. Assert(outBodyEncodedChars == Chars<EncodedChar>::OSB(next, input));
  2621. Assert(outBodyChars == Pos());
  2622. next = pastOptions;
  2623. outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2624. outTotalChars = Pos();
  2625. return root;
  2626. }
  2627. template <typename P, const bool IsLiteral>
  2628. void Parser<P, IsLiteral>::ParseLiteralNoAST
  2629. ( const EncodedChar* input
  2630. , const EncodedChar* inputLim
  2631. , CharCount& outBodyEncodedChars
  2632. , CharCount& outTotalEncodedChars
  2633. , CharCount& outBodyChars
  2634. , CharCount& outTotalChars )
  2635. {
  2636. Assert(IsLiteral);
  2637. Assert(input != 0);
  2638. Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
  2639. // Body, pass 0
  2640. SetPosition(input, inputLim, true);
  2641. PatternPass0();
  2642. outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2643. outBodyChars = Pos();
  2644. // Options
  2645. ECMust('/', ERRnoSlash);
  2646. RegexFlags dummyFlags = NoRegexFlags;
  2647. Options(dummyFlags);
  2648. this->unicodeFlagPresent = (dummyFlags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2649. this->caseInsensitiveFlagPresent = (dummyFlags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2650. outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2651. outTotalChars = Pos();
  2652. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2653. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2654. {
  2655. throw ParseError(*deferredIfNotUnicodeError);
  2656. }
  2657. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2658. {
  2659. throw ParseError(*deferredIfUnicodeError);
  2660. }
  2661. }
  2662. template <typename P, const bool IsLiteral>
  2663. template <const bool buildAST>
  2664. RegexPattern * Parser<P, IsLiteral>::CompileProgram
  2665. ( Node* root,
  2666. const EncodedChar*& currentCharacter,
  2667. const CharCount totalLen,
  2668. const CharCount bodyChars,
  2669. const CharCount totalChars,
  2670. const RegexFlags flags )
  2671. {
  2672. Assert(IsLiteral);
  2673. Program* program = nullptr;
  2674. if (buildAST)
  2675. {
  2676. const auto recycler = this->scriptContext->GetRecycler();
  2677. program = Program::New(recycler, flags);
  2678. this->CaptureSourceAndGroups(recycler, program, currentCharacter, bodyChars);
  2679. }
  2680. currentCharacter += totalLen;
  2681. Assert(GetMultiUnits() == totalLen - totalChars);
  2682. if (!buildAST)
  2683. {
  2684. return nullptr;
  2685. }
  2686. RegexPattern* pattern = RegexPattern::New(this->scriptContext, program, true);
  2687. #if ENABLE_REGEX_CONFIG_OPTIONS
  2688. RegexStats* stats = 0;
  2689. if (REGEX_CONFIG_FLAG(RegexProfile))
  2690. {
  2691. stats = this->scriptContext->GetRegexStatsDatabase()->GetRegexStats(pattern);
  2692. this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Parse);
  2693. }
  2694. if (REGEX_CONFIG_FLAG(RegexTracing))
  2695. {
  2696. DebugWriter* tw = this->scriptContext->GetRegexDebugWriter();
  2697. tw->Print(_u("// REGEX COMPILE "));
  2698. pattern->Print(tw);
  2699. tw->EOL();
  2700. }
  2701. if (REGEX_CONFIG_FLAG(RegexProfile))
  2702. this->scriptContext->GetRegexStatsDatabase()->BeginProfile();
  2703. #endif
  2704. ArenaAllocator* rtAllocator = this->scriptContext->RegexAllocator();
  2705. Compiler::Compile
  2706. ( this->scriptContext
  2707. , ctAllocator
  2708. , rtAllocator
  2709. , standardChars
  2710. , program
  2711. , root
  2712. , this->GetLitbuf()
  2713. , pattern
  2714. #if ENABLE_REGEX_CONFIG_OPTIONS
  2715. , w
  2716. , stats
  2717. #endif
  2718. );
  2719. #if ENABLE_REGEX_CONFIG_OPTIONS
  2720. if (REGEX_CONFIG_FLAG(RegexProfile))
  2721. this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Compile);
  2722. #endif
  2723. #ifdef PROFILE_EXEC
  2724. this->scriptContext->ProfileEnd(Js::RegexCompilePhase);
  2725. #endif
  2726. return pattern;
  2727. }
  2728. template <typename P, const bool IsLiteral>
  2729. void Parser<P, IsLiteral>::CaptureEmptySourceAndNoGroups(Program* program)
  2730. {
  2731. Assert(program->source == 0);
  2732. program->source = _u("");
  2733. program->sourceLen = 0;
  2734. program->numGroups = 1;
  2735. // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
  2736. }
  2737. template <typename P, const bool IsLiteral>
  2738. void Parser<P, IsLiteral>::CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars)
  2739. {
  2740. Assert(program->source == 0);
  2741. Assert(body != 0);
  2742. // Program will own source string
  2743. program->source = RecyclerNewArrayLeaf(recycler, Char, bodyChars + 1);
  2744. // Don't need to zero out since we're writing to the buffer right here
  2745. ConvertToUnicode(program->source, bodyChars, body);
  2746. program->source[bodyChars] = 0;
  2747. program->sourceLen = bodyChars;
  2748. program->numGroups = nextGroupId;
  2749. // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
  2750. }
  2751. template <typename P, const bool IsLiteral>
  2752. Node* Parser<P, IsLiteral>::GetNodeWithValidCharacterSet(EncodedChar cc)
  2753. {
  2754. Node* nodeToReturn = nullptr;
  2755. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && this->unicodeFlagPresent)
  2756. {
  2757. MatchSetNode *lowerRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
  2758. lowerRangeNode->set.SetRange(ctAllocator, (Char)0xD800, (Char)0xDBFF);
  2759. MatchSetNode *upperRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
  2760. upperRangeNode->set.SetRange(ctAllocator, (Char)0xDC00, (Char)0xDFFF);
  2761. ConcatNode* surrogateRangePairNode = Anew(ctAllocator, ConcatNode, lowerRangeNode, Anew(ctAllocator, ConcatNode, upperRangeNode, nullptr));
  2762. // The MatchSet node will be split into [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) as a prefix, and a suffix of [D800-DBFF]
  2763. // i.e. The MatchSet node can be with [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) OR [D800-DBFF]
  2764. MatchSetNode* partialPrefixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  2765. switch (cc)
  2766. {
  2767. case '.':
  2768. standardChars->SetNonNewline(ctAllocator, partialPrefixSetNode->set);
  2769. break;
  2770. case 'S':
  2771. standardChars->SetNonWhitespace(ctAllocator, partialPrefixSetNode->set);
  2772. break;
  2773. case 'D':
  2774. standardChars->SetNonDigits(ctAllocator, partialPrefixSetNode->set);
  2775. break;
  2776. case 'W':
  2777. standardChars->SetNonWordChars(ctAllocator, partialPrefixSetNode->set);
  2778. break;
  2779. default:
  2780. AssertMsg(false, "");
  2781. }
  2782. partialPrefixSetNode->set.SubtractRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
  2783. MatchSetNode* partialSuffixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  2784. partialSuffixSetNode->set.SetRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
  2785. AltNode* altNode = Anew(ctAllocator, AltNode, partialPrefixSetNode, Anew(ctAllocator, AltNode, surrogateRangePairNode, Anew(ctAllocator, AltNode, partialSuffixSetNode, nullptr)));
  2786. nodeToReturn = altNode;
  2787. }
  2788. else
  2789. {
  2790. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  2791. switch (cc)
  2792. {
  2793. case '.':
  2794. standardChars->SetNonNewline(ctAllocator, setNode->set);
  2795. break;
  2796. case 'S':
  2797. standardChars->SetNonWhitespace(ctAllocator, setNode->set);
  2798. break;
  2799. case 'D':
  2800. standardChars->SetNonDigits(ctAllocator, setNode->set);
  2801. break;
  2802. case 'W':
  2803. standardChars->SetNonWordChars(ctAllocator, setNode->set);
  2804. break;
  2805. default:
  2806. AssertMsg(false, "");
  2807. }
  2808. nodeToReturn = setNode;
  2809. }
  2810. return nodeToReturn;
  2811. }
  2812. template <typename P, const bool IsLiteral>
  2813. void Parser<P, IsLiteral>::FreeBody()
  2814. {
  2815. if (litbuf != 0)
  2816. {
  2817. ctAllocator->Free(litbuf, litbufLen);
  2818. litbuf = 0;
  2819. litbufLen = 0;
  2820. litbufNext = 0;
  2821. }
  2822. }
  2823. //
  2824. // Template instantiation
  2825. //
  2826. template <typename P, const bool IsLiteral>
  2827. void UnifiedRegexParserForceInstantiation()
  2828. {
  2829. typedef typename P::EncodedChar EncodedChar;
  2830. Parser<P, IsLiteral> p
  2831. ( 0
  2832. , 0
  2833. , 0
  2834. , 0
  2835. , false
  2836. #if ENABLE_REGEX_CONFIG_OPTIONS
  2837. , 0
  2838. #endif
  2839. );
  2840. RegexFlags f;
  2841. CharCount a, b, c, d;
  2842. const EncodedChar* cp = 0;
  2843. p.ParseDynamic(0, 0, 0, 0, f);
  2844. p.ParseLiteral(0, 0, a, b, c, d, f);
  2845. p.ParseLiteralNoAST(0, 0, a, b, c, d);
  2846. p.CompileProgram<true>(0, cp, a, b, c, f);
  2847. p.CompileProgram<false>(0, cp, a, b, c, f);
  2848. p.CaptureEmptySourceAndNoGroups(0);
  2849. p.CaptureSourceAndGroups(0, 0, 0, 0);
  2850. p.FreeBody();
  2851. }
  2852. void UnifiedRegexParserForceAllInstantiations()
  2853. {
  2854. UnifiedRegexParserForceInstantiation<NullTerminatedUnicodeEncodingPolicy, false>();
  2855. UnifiedRegexParserForceInstantiation<NullTerminatedUnicodeEncodingPolicy, true>();
  2856. UnifiedRegexParserForceInstantiation<NullTerminatedUTF8EncodingPolicy, false>();
  2857. UnifiedRegexParserForceInstantiation<NullTerminatedUTF8EncodingPolicy, true>();
  2858. UnifiedRegexParserForceInstantiation<NotNullTerminatedUTF8EncodingPolicy, false>();
  2859. UnifiedRegexParserForceInstantiation<NotNullTerminatedUTF8EncodingPolicy, true>();
  2860. }
  2861. }