RegexParser.cpp 127 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. /*
  6. --------------------------------------------------------------------------------------------------------------------------------
  7. Original
  8. --------------------------------------------------------------------------------------------------------------------------------
  9. Pattern ::= Disjunction
  10. Disjunction ::= Alternative | Alternative '|' Disjunction
  11. Alternative ::= [empty] | Alternative Term
  12. Term ::= Assertion | Atom | Atom Quantifier
  13. Assertion ::= '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  14. Quantifier ::= QuantifierPrefix | QuantifierPrefix '?'
  15. QuantifierPrefix ::= '*' | '+' | '?' | '{' DecimalDigits '}' | '{' DecimalDigits ',' '}' | '{' DecimalDigits ',' DecimalDigits '}'
  16. Atom ::= PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')'
  17. PatternCharacter ::= SourceCharacter but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
  18. AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
  19. CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
  20. ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
  21. ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
  22. IdentityEscape ::= (SourceCharacter but not IdentifierPart) | <ZWJ> | <ZWNJ>
  23. DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
  24. CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
  25. CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
  26. ClassRanges ::= [empty] | NonemptyClassRanges
  27. NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
  28. NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
  29. ClassAtom ::= '-' | ClassAtomNoDash
  30. ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
  31. ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
  32. SourceCharacter ::= <unicode character>
  33. HexEscapeSequence ::= 'x' HexDigit HexDigit
  34. UnicodeEscapeSequence ::= 'u' HexDigit HexDigit HexDigit HexDigit | 'u' '{' HexDigits '}'
  35. HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
  36. IdentifierStart ::= UnicodeLetter | '$' | '_' | '\' UnicodeEscapeSequence
  37. IdentifierPart ::= IdentifierStart | UnicodeCombiningMark | UnicodeDigit | UnicodeConnectorPunctuation | <ZWNJ> | <ZWJ>
  38. UnicodeLetter ::= <unicode Uppercase letter> | <unicode Lowercase letter> | <unicode Titlecase letter> | <unicode Modifier letter> | <unicode Other letter> | <unicode Letter number>
  39. UnicodeCombiningMark = <unicode Non-spacing mark> | <unicode combining spacing mark>
  40. UnicodeDigit ::= <unicode Decimal number>
  41. UnicodeConnectorPunctuation ::= <unicode Connector punctuation>
  42. DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
  43. DecimalDigits ::= DecimalDigit | DecimalDigits DecimalDigit
  44. DecimalDigit ::= one of { '0'..'9' }
  45. NonZeroDigit ::= one of { '1'..'9' }
  46. ------------------
  47. Annex B Deviations
  48. ------------------
  49. 1. The assertions (?= ) and (?! ) are treated as though they have a surrounding non-capture group, and hence can be quantified.
  50. Other assertions are not quantifiable.
  51. QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  52. Assertion [replaced] ::= '^' | '$' | '\' 'b' | '\' 'B' | QuantifiableAssertion
  53. Term ::= ... | QuantifiableAssertion Quantifier [added]
  54. --------------------------------------------------------------------------------------------------------------------------------
  55. Left factored
  56. --------------------------------------------------------------------------------------------------------------------------------
  57. Pattern ::= Disjunction
  58. Disjunction ::= Alternative ('|' Alternative)*
  59. FOLLOW(Disjunction) = { <eof>, ')' }
  60. Alternative ::= Term*
  61. FOLLOW(Alternative) = { <eof>, ')', '|' }
  62. Term ::= ( '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')' | Atom Quantifier? )
  63. | ( PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')' )
  64. ( '*' | '+' | '?' | '{' DecimalDigits (',' DecimalDigits?)? '}' ) '?'?
  65. PatternCharacter ::= <unicode character> but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
  66. AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
  67. CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
  68. ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
  69. ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
  70. IdentityEscape ::= <unicode character> but not <unicode Uppercase letter>, <unicode Lowercase letter>, <unicode Titlecase letter>, <unicode Modifier letter>, <unicode Other letter>, <unicode Letter number>, '$', '_', <unicode Non-spacing mark>, <unicode combining spacing mark>, <unicode Decimal number>, <unicode Connector punctuation>
  71. DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
  72. CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
  73. CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
  74. ClassRanges ::= [empty] | NonemptyClassRanges
  75. NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
  76. NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
  77. ClassAtom ::= '-' | ClassAtomNoDash
  78. ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
  79. ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
  80. HexEscapeSequence ::= 'x' HexDigit{2}
  81. UnicodeEscapeSequence ::= 'u' HexDigit{4} | 'u' '{' HexDigits{4-6} '}'
  82. HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
  83. DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
  84. DecimalDigits ::= DecimalDigit+
  85. DecimalDigit ::= one of { '0'..'9' }
  86. NonZeroDigit ::= one of { '1'..'9' }
  87. ------------------
  88. Annex B Deviations
  89. ------------------
  90. QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  91. Term ::= ... | '(' '?' '=' Disjunction ')' [removed] | '(' '?' '!' Disjunction ')' [removed] | QuantifiableAssertion Quantifier? [added]
  92. */
  93. #include "ParserPch.h"
  94. namespace UnifiedRegex
  95. {
  96. template <typename P, const bool IsLiteral>
  97. const CharCount Parser<P, IsLiteral>::initLitbufSize;
  98. ParseError::ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error)
  99. : isBody(isBody), pos(pos), encodedPos(encodedPos), error(error)
  100. {
  101. }
  102. template <typename P, const bool IsLiteral>
  103. Parser<P, IsLiteral>::Parser
  104. ( Js::ScriptContext* scriptContext
  105. , ArenaAllocator* ctAllocator
  106. , StandardChars<EncodedChar>* standardEncodedChars
  107. , StandardChars<Char>* standardChars
  108. , bool isUtf8
  109. #if ENABLE_REGEX_CONFIG_OPTIONS
  110. , DebugWriter* w
  111. #endif
  112. )
  113. : scriptContext(scriptContext)
  114. , ctAllocator(ctAllocator)
  115. , standardEncodedChars(standardEncodedChars)
  116. , standardChars(standardChars)
  117. #if ENABLE_REGEX_CONFIG_OPTIONS
  118. , w(w)
  119. #endif
  120. , input(0)
  121. , inputLim(0)
  122. , next(0)
  123. , inBody(false)
  124. , numGroups(1)
  125. , nextGroupId(1) // implicit overall group always takes index 0
  126. , litbuf(0)
  127. , litbufLen(0)
  128. , litbufNext(0)
  129. , surrogatePairList(nullptr)
  130. , currentSurrogatePairNode(nullptr)
  131. , tempLocationOfSurrogatePair(nullptr)
  132. , tempLocationOfRange(nullptr)
  133. , codePointAtTempLocation(0)
  134. , unicodeFlagPresent(false)
  135. , caseInsensitiveFlagPresent(false)
  136. , positionAfterLastSurrogate(nullptr)
  137. , valueOfLastSurrogate(INVALID_CODEPOINT)
  138. , deferredIfNotUnicodeError(nullptr)
  139. , deferredIfUnicodeError(nullptr)
  140. {
  141. this->SetIsUtf8(isUtf8);
  142. }
  143. //
  144. // Input buffer management
  145. //
  146. template <typename P, const bool IsLiteral>
  147. void Parser<P, IsLiteral>::SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody)
  148. {
  149. this->input = input;
  150. this->inputLim = inputLim;
  151. next = input;
  152. this->inBody = inBody;
  153. this->RestoreMultiUnits(0);
  154. }
  155. template <typename P, const bool IsLiteral>
  156. inline CharCount Parser<P, IsLiteral>::Pos()
  157. {
  158. CharCount nextOffset = Chars<EncodedChar>::OSB(next, input);
  159. Assert(nextOffset >= this->m_cMultiUnits);
  160. return nextOffset - (CharCount) this->m_cMultiUnits;
  161. }
  162. template <typename P, const bool IsLiteral>
  163. inline bool Parser<P, IsLiteral>::IsEOF()
  164. {
  165. return next >= inputLim;
  166. }
  167. template <typename P, const bool IsLiteral>
  168. inline bool Parser<P, IsLiteral>::ECCanConsume(CharCount n /*= 1*/)
  169. {
  170. return next + n <= inputLim;
  171. }
  172. template <typename P, const bool IsLiteral>
  173. inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookahead(CharCount n /*= 0*/)
  174. {
  175. // Ok to look ahead to terminating 0
  176. Assert(next + n <= inputLim);
  177. return next[n];
  178. }
  179. template <typename P, const bool IsLiteral>
  180. inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookback(CharCount n /*= 0*/)
  181. {
  182. // Ok to look ahead to terminating 0
  183. Assert(n + input <= next);
  184. return *(next - n);
  185. }
  186. template <typename P, const bool IsLiteral>
  187. inline void Parser<P, IsLiteral>::ECConsume(CharCount n /*= 1*/)
  188. {
  189. Assert(next + n <= inputLim);
  190. #if DBG
  191. for (CharCount i = 0; i < n; i++)
  192. Assert(!this->IsMultiUnitChar(next[i]));
  193. #endif
  194. next += n;
  195. }
  196. template <typename P, const bool IsLiteral>
  197. inline void Parser<P, IsLiteral>::ECConsumeMultiUnit(CharCount n /*= 1*/)
  198. {
  199. Assert(next + n <= inputLim);
  200. next += n;
  201. }
  202. template <typename P, const bool IsLiteral>
  203. inline void Parser<P, IsLiteral>::ECRevert(CharCount n /*= 1*/)
  204. {
  205. Assert(n + input <= next);
  206. next -= n;
  207. }
  208. //
  209. // Helpers
  210. //
  211. template <typename P, const bool IsLiteral>
  212. int Parser<P, IsLiteral>::TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair /* = false */)
  213. {
  214. if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  215. {
  216. return 0;
  217. }
  218. if (!ECCanConsume(2) || ECLookahead(0) !='{' || !standardEncodedChars->IsHex(ECLookahead(1)))
  219. {
  220. return 0;
  221. }
  222. // The first character is mandatory to consume escape sequence, so we check for it above, at this stage we can set it as we already checked.
  223. codepoint_t codePoint = standardEncodedChars->DigitValue(ECLookahead(1));
  224. int i = 2;
  225. while(ECCanConsume(i + 1) && standardEncodedChars->IsHex(ECLookahead(i)))
  226. {
  227. codePoint <<= 4;
  228. codePoint += standardEncodedChars->DigitValue(ECLookahead(i));
  229. if (codePoint > 0x10FFFF)
  230. {
  231. DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
  232. return 0;
  233. }
  234. i++;
  235. }
  236. if(!ECCanConsume(i + 1) || ECLookahead(i) != '}')
  237. {
  238. return 0;
  239. }
  240. uint consumptionNumber = i + 1;
  241. Assert(consumptionNumber >= 3);
  242. if (!previousSurrogatePart && trackSurrogatePair && this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  243. {
  244. // Current location
  245. TrackIfSurrogatePair(codePoint, (next - 1), consumptionNumber + 1);
  246. }
  247. char16 other;
  248. // Generally if this code point is a single character, then we take it and return.
  249. // If the character is made up of two characters then we emit the first and backtrack to the start of th escape sequence;
  250. // Following that we check if we have already seen the first character, and if so emit the second and consume the entire escape sequence.
  251. if (codePoint < 0x10000)
  252. {
  253. c = UTC(codePoint);
  254. ECConsumeMultiUnit(consumptionNumber);
  255. }
  256. else if (previousSurrogatePart)
  257. {
  258. previousSurrogatePart = false;
  259. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &other, &c);
  260. ECConsumeMultiUnit(consumptionNumber);
  261. }
  262. else
  263. {
  264. previousSurrogatePart = true;
  265. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &c, &other);
  266. Assert(ECLookback(1) == 'u' && ECLookback(2) == '\\');
  267. ECRevert(2);
  268. }
  269. return consumptionNumber;
  270. }
  271. // This function has the following 'knowledge':
  272. // - A codepoint that might be part of a surrogate pair or part of one
  273. // - A location where that codepoint is located
  274. // - Previously tracked part of surrogate pair (tempLocationOfSurrogatePair, and codePointAtTempLocation), which is always a surrogate lower part.
  275. // - A pointer to the current location of the linked list
  276. // - If a previous location is tracked, then it is of a parsed character (not given character) before current, and not same to current
  277. // This can't be asserted directly, and has to be followed by callers. Term pass can reset with each iteration, as well as this method in cases it needs.
  278. template <typename P, const bool IsLiteral>
  279. void Parser<P, IsLiteral>:: TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength)
  280. {
  281. Assert(codePoint < 0x110000);
  282. Assert(location != nullptr);
  283. Assert(location != this->tempLocationOfSurrogatePair);
  284. if (Js::NumberUtilities::IsSurrogateLowerPart(codePoint))
  285. {
  286. this->tempLocationOfSurrogatePair = location;
  287. this->codePointAtTempLocation = codePoint;
  288. }
  289. else
  290. {
  291. if(Js::NumberUtilities::IsSurrogateUpperPart(codePoint) && this->tempLocationOfSurrogatePair != nullptr)
  292. {
  293. Assert(Js::NumberUtilities::IsSurrogateLowerPart(codePointAtTempLocation));
  294. consumptionLength = (uint32)(location - this->tempLocationOfSurrogatePair) + consumptionLength;
  295. codePoint = Js::NumberUtilities::SurrogatePairAsCodePoint(codePointAtTempLocation, codePoint);
  296. location = this->tempLocationOfSurrogatePair;
  297. }
  298. // At this point we can clear previous location, and then if codePoint is bigger than 0xFFFF store it, as we either received it or combined it above
  299. this->tempLocationOfSurrogatePair = nullptr;
  300. this->codePointAtTempLocation = 0;
  301. }
  302. if (codePoint > 0xFFFF)
  303. {
  304. this->positionAfterLastSurrogate = location + consumptionLength;
  305. this->valueOfLastSurrogate = codePoint;
  306. // When parsing without AST we aren't given an allocator. In addition, only the 2 lines above are used during Pass 0;
  307. // while the bottom is used during Pass 1 (which isn't done when ParseNoAST)
  308. if(this->ctAllocator != nullptr)
  309. {
  310. SurrogatePairTracker* node = Anew(this->ctAllocator, SurrogatePairTracker, location, this->tempLocationOfRange, codePoint, consumptionLength, this->m_cMultiUnits);
  311. if (surrogatePairList == nullptr)
  312. {
  313. Assert(currentSurrogatePairNode == nullptr);
  314. surrogatePairList = node;
  315. currentSurrogatePairNode = node;
  316. }
  317. else
  318. {
  319. Assert(currentSurrogatePairNode != nullptr);
  320. currentSurrogatePairNode->next = node;
  321. currentSurrogatePairNode = node;
  322. }
  323. }
  324. }
  325. }
  326. template <typename P, const bool IsLiteral>
  327. Node* Parser<P, IsLiteral>::CreateSurrogatePairAtom(char16 lower, char16 upper)
  328. {
  329. MatchLiteralNode * literalNode = Anew(this->ctAllocator, MatchLiteralNode, 0, 0);
  330. MatchCharNode lowerNode(lower);
  331. MatchCharNode upperNode(upper);
  332. AccumLiteral(literalNode, &lowerNode);
  333. AccumLiteral(literalNode, &upperNode);
  334. return literalNode;
  335. }
  336. // This function will create appropriate pairs of ranges and add them to the disjunction node.
  337. // Terms used in comments:
  338. // - A minor codePoint is smaller than major codePoint, and both define a range of codePoints above 0x10000; to avoid confusion between lower/upper denoting codeUnits composing the surrogate pair.
  339. // - A boundary is a mod 0x400 alignment marker due to the nature of surrogate pairs representation. So the codepoint 0x10300 lies between boundaries 0x10000 and 0x10400.
  340. // - A prefix is the range set used to represent the values from minorCodePoint to the first boundary above minorCodePoint if applicable.
  341. // - A suffix is the range set used to represent the values from first boundary below majorCodePoint to the majorCodePoint if applicable.
  342. // - A full range is the range set used to represent the values from first boundary above minorCodePoint to first boundary below majorCodePoint if applicable.
  343. // The algorithm works as follows:
  344. // 1. Determine minorBoundary (minorCodePoint - mod 0x400 +0x400) and majorBoundary (majorCodePoint - mod 0x400). minorBoundary > minorCodePoint and majorBoundary < majorCodePoint
  345. // 2. Based on the codePoints and the boundaries, prefix, suffix, and full range is determined. Here are the rules:
  346. // 2-a. If minorBoundary > majorBoundary, we have an inner boundary range, output just that.
  347. // 2-b. If minorBoundary - 0x400u != minorCodepoint (i.e. codePoint doesn't lie right on a boundary to be part of a full range) we have a prefix.
  348. // 2-c. If majorBoundary + 0x3FFu != majorCodepoint (i.e. codePoint doesn't lie right before a boundary to be part of a full range) we have a suffix.
  349. // 2-d. We have a full range, if the two boundaries don't equal, OR the codePoints lie on the range boundaries opposite to what constitutes a prefix/suffix.
  350. // Visual representation for sample range 0x10300 - 0x10900
  351. // | [ _ | ^ ] |
  352. // 0x10000 0x10800 0x11000
  353. // [ ] - denote the actual range
  354. // _ - minorBoundary
  355. // ^ - majorBoundary
  356. // | - other boundaries
  357. // prefix is between [ and _
  358. // suffix is between ^ and ]
  359. // full range is between _ and ^
  360. template <typename P, const bool IsLiteral>
  361. AltNode* Parser<P, IsLiteral>::AppendSurrogateRangeToDisjunction(codepoint_t minorCodePoint, codepoint_t majorCodePoint, AltNode *lastAltNode)
  362. {
  363. Assert(minorCodePoint < majorCodePoint);
  364. Assert(minorCodePoint >= 0x10000u);
  365. Assert(majorCodePoint >= 0x10000u);
  366. Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && unicodeFlagPresent);
  367. char16 lowerMinorCodeUnit, upperMinorCodeUnit, lowerMajorCodeUnit, upperMajorCodeUnit;
  368. Js::NumberUtilities::CodePointAsSurrogatePair(minorCodePoint, &lowerMinorCodeUnit, &upperMinorCodeUnit);
  369. Js::NumberUtilities::CodePointAsSurrogatePair(majorCodePoint, &lowerMajorCodeUnit, &upperMajorCodeUnit);
  370. // These boundaries represent whole range boundaries, as in 0x10000, 0x10400, 0x10800 etc
  371. // minor boundary is the first boundary strictly above minorCodePoint
  372. // major boundary is the first boundary below or equal to majorCodePoint
  373. codepoint_t minorBoundary = minorCodePoint - (minorCodePoint % 0x400u) + 0x400u;
  374. codepoint_t majorBoundary = majorCodePoint - (majorCodePoint % 0x400u);
  375. Assert(minorBoundary >= 0x10000);
  376. Assert(majorBoundary >= 0x10000);
  377. AltNode* tailToAdd = nullptr;
  378. // If the minor boundary is higher than major boundary, that means we have a range within the boundary and is less than 0x400
  379. // Ex: 0x10430 - 0x10700 will have minor boundary of 0x10800 and major of 0x10400
  380. // This pair will be represented in single range set.
  381. const bool singleRange = minorBoundary > majorBoundary;
  382. if (singleRange)
  383. {
  384. Assert(majorCodePoint - minorCodePoint < 0x400u);
  385. Assert(lowerMinorCodeUnit == lowerMajorCodeUnit);
  386. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, lowerMinorCodeUnit);
  387. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  388. setNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)upperMajorCodeUnit);
  389. ConcatNode* concatNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, setNode, nullptr));
  390. tailToAdd = Anew(ctAllocator, AltNode, concatNode, nullptr);
  391. }
  392. else
  393. {
  394. Node* prefixNode = nullptr, *suffixNode = nullptr;
  395. const bool twoConsecutiveRanges = minorBoundary == majorBoundary;
  396. // For minorBoundary,
  397. if (minorBoundary - minorCodePoint == 1) // Single character in minor range
  398. {
  399. // The prefix is only a surrogate pair atom
  400. prefixNode = CreateSurrogatePairAtom(lowerMinorCodeUnit, upperMinorCodeUnit);
  401. }
  402. else if (minorCodePoint != minorBoundary - 0x400u) // Minor range isn't full
  403. {
  404. Assert(minorBoundary - minorCodePoint < 0x400u);
  405. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorCodeUnit);
  406. MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false);
  407. upperSetNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)0xDFFFu);
  408. prefixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
  409. }
  410. else // Full minor range
  411. {
  412. minorBoundary -= 0x400u;
  413. }
  414. if (majorBoundary == majorCodePoint) // Single character in major range
  415. {
  416. // The suffix is only a surrogate pair atom
  417. suffixNode = CreateSurrogatePairAtom(lowerMajorCodeUnit, upperMajorCodeUnit);
  418. majorBoundary -= 0x400u;
  419. }
  420. else if (majorBoundary + 0x3FFu != majorCodePoint) // Major range isn't full
  421. {
  422. Assert(majorCodePoint - majorBoundary < 0x3FFu);
  423. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMajorCodeUnit);
  424. MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  425. upperSetNode->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)upperMajorCodeUnit);
  426. suffixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
  427. majorBoundary -= 0x400u;
  428. }
  429. const bool nonFullConsecutiveRanges = twoConsecutiveRanges && prefixNode != nullptr && suffixNode != nullptr;
  430. if (nonFullConsecutiveRanges)
  431. {
  432. Assert(suffixNode != nullptr);
  433. Assert(minorCodePoint != minorBoundary - 0x400u);
  434. Assert(majorBoundary + 0x3FFu != majorCodePoint);
  435. // If the minor boundary is equal to major boundary, that means we have a cross boundary range that only needs 2 nodes for prefix/suffix.
  436. // We can only cross one boundary.
  437. Assert(majorCodePoint - minorCodePoint < 0x800u);
  438. tailToAdd = Anew(ctAllocator, AltNode, prefixNode, Anew(ctAllocator, AltNode, suffixNode, nullptr));
  439. }
  440. else
  441. {
  442. // We have 3 sets of ranges, comprising of prefix, full and suffix.
  443. Assert(majorCodePoint - minorCodePoint >= 0x400u);
  444. Assert((prefixNode != nullptr && suffixNode != nullptr) // Spanning more than two ranges
  445. || (prefixNode == nullptr && minorBoundary == minorCodePoint) // Two consecutive ranges and the minor is full
  446. || (suffixNode == nullptr && majorBoundary + 0x3FFu == majorCodePoint)); // Two consecutive ranges and the major is full
  447. Node* lowerOfFullRange;
  448. char16 lowerMinorBoundary, lowerMajorBoundary, ignore;
  449. Js::NumberUtilities::CodePointAsSurrogatePair(minorBoundary, &lowerMinorBoundary, &ignore);
  450. bool singleFullRange = majorBoundary == minorBoundary;
  451. if (singleFullRange)
  452. {
  453. // The lower part of the full range is simple a surrogate lower char
  454. lowerOfFullRange = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorBoundary);
  455. }
  456. else
  457. {
  458. Js::NumberUtilities::CodePointAsSurrogatePair(majorBoundary, &lowerMajorBoundary, &ignore);
  459. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  460. setNode->set.SetRange(ctAllocator, (Char)lowerMinorBoundary, (Char)lowerMajorBoundary);
  461. lowerOfFullRange = setNode;
  462. }
  463. MatchSetNode* fullUpperRange = Anew(ctAllocator, MatchSetNode, false, false);
  464. fullUpperRange->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)0xDFFFu);
  465. // These are added in the following order [full] [prefix][suffix]
  466. // This is doing by prepending, so in reverse.
  467. if (suffixNode != nullptr)
  468. {
  469. tailToAdd = Anew(ctAllocator, AltNode, suffixNode, tailToAdd);
  470. }
  471. if (prefixNode != nullptr)
  472. {
  473. tailToAdd = Anew(ctAllocator, AltNode, prefixNode, tailToAdd);
  474. }
  475. tailToAdd = Anew(ctAllocator, AltNode, Anew(ctAllocator, ConcatNode, lowerOfFullRange, Anew(ctAllocator, ConcatNode, fullUpperRange, nullptr)), tailToAdd);
  476. }
  477. }
  478. if (lastAltNode != nullptr)
  479. {
  480. Assert(lastAltNode->tail == nullptr);
  481. lastAltNode->tail = tailToAdd;
  482. }
  483. return tailToAdd;
  484. }
  485. template <typename P, const bool IsLiteral>
  486. AltNode* Parser<P, IsLiteral>::AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAltNode)
  487. {
  488. char16 lower, upper;
  489. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &upper);
  490. AltNode* tailNode = Anew(ctAllocator, AltNode, CreateSurrogatePairAtom(lower, upper), nullptr);
  491. if (lastAltNode != nullptr)
  492. {
  493. lastAltNode->tail = tailNode;
  494. }
  495. return tailNode;
  496. }
  497. //
  498. // Errors
  499. //
  500. template <typename P, const bool IsLiteral>
  501. void Parser<P, IsLiteral>::Fail(HRESULT error)
  502. {
  503. throw ParseError(inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  504. }
  505. // This doesn't throw, but stores first error code for throwing later
  506. template <typename P, const bool IsLiteral>
  507. void Parser<P, IsLiteral>::DeferredFailIfUnicode(HRESULT error)
  508. {
  509. Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  510. if (this->deferredIfUnicodeError == nullptr)
  511. {
  512. this->deferredIfUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  513. }
  514. }
  515. // This doesn't throw, but stores first error code for throwing later
  516. template <typename P, const bool IsLiteral>
  517. void Parser<P, IsLiteral>::DeferredFailIfNotUnicode(HRESULT error)
  518. {
  519. Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  520. if (this->deferredIfNotUnicodeError == nullptr)
  521. {
  522. this->deferredIfNotUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  523. }
  524. }
  525. template <typename P, const bool IsLiteral>
  526. inline void Parser<P, IsLiteral>::ECMust(EncodedChar ec, HRESULT error)
  527. {
  528. // We never look for 0
  529. Assert(ec != 0);
  530. if (ECLookahead() != ec)
  531. Fail(error);
  532. ECConsume();
  533. }
  534. template <typename P, const bool IsLiteral>
  535. inline char16 Parser<P, IsLiteral>::NextChar()
  536. {
  537. Assert(!IsEOF());
  538. // Could be an embedded 0
  539. Char c = this->template ReadFull<true>(next, inputLim);
  540. // No embedded newlines in literals
  541. if (IsLiteral && standardChars->IsNewline(c))
  542. Fail(ERRnoSlash);
  543. return c;
  544. }
  545. //
  546. // Patterns/Disjunctions/Alternatives
  547. //
  548. template <typename P, const bool IsLiteral>
  549. void Parser<P, IsLiteral>::PatternPass0()
  550. {
  551. this->positionAfterLastSurrogate = nullptr;
  552. this->deferredIfNotUnicodeError = nullptr;
  553. this->deferredIfUnicodeError = nullptr;
  554. DisjunctionPass0(0);
  555. }
  556. template <typename P, const bool IsLiteral>
  557. Node* Parser<P, IsLiteral>::PatternPass1()
  558. {
  559. return DisjunctionPass1();
  560. }
  561. template <typename P, const bool IsLiteral>
  562. Node* Parser<P, IsLiteral>::UnionNodes(Node* prev, Node* curr)
  563. {
  564. if (prev->tag == Node::MatchChar)
  565. {
  566. MatchCharNode* prevChar = (MatchCharNode*)prev;
  567. if (curr->tag == Node::MatchChar)
  568. {
  569. MatchCharNode* currChar = (MatchCharNode*)curr;
  570. if (prevChar->cs[0] == currChar->cs[0])
  571. // Just ignore current node
  572. return prevChar;
  573. else
  574. {
  575. // Union chars into new set
  576. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
  577. setNode->set.Set(ctAllocator, prevChar->cs[0]);
  578. setNode->set.Set(ctAllocator, currChar->cs[0]);
  579. return setNode;
  580. }
  581. }
  582. else if (curr->tag == Node::MatchSet)
  583. {
  584. MatchSetNode* currSet = (MatchSetNode*)curr;
  585. if (currSet->isNegation)
  586. // Can't merge
  587. return 0;
  588. else
  589. {
  590. // Union chars into new set
  591. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
  592. setNode->set.Set(ctAllocator, prevChar->cs[0]) ;
  593. setNode->set.UnionInPlace(ctAllocator, currSet->set);
  594. return setNode;
  595. }
  596. }
  597. else
  598. // Can't merge
  599. return 0;
  600. }
  601. else if (prev->tag == Node::MatchSet)
  602. {
  603. MatchSetNode* prevSet = (MatchSetNode*)prev;
  604. if (prevSet->isNegation)
  605. // Can't merge
  606. return 0;
  607. else if (curr->tag == Node::MatchChar)
  608. {
  609. MatchCharNode* currChar = (MatchCharNode*)curr;
  610. // Include char in prev set
  611. prevSet->set.Set(ctAllocator, currChar->cs[0]);
  612. return prevSet;
  613. }
  614. else if (curr->tag == Node::MatchSet)
  615. {
  616. MatchSetNode* currSet = (MatchSetNode*)curr;
  617. if (currSet->isNegation)
  618. // Can't merge
  619. return 0;
  620. else
  621. {
  622. // Include chars in prev set
  623. prevSet->set.UnionInPlace(ctAllocator, currSet->set);
  624. return prevSet;
  625. }
  626. }
  627. else
  628. // Can't merge
  629. return 0;
  630. }
  631. else
  632. // Can't merge
  633. return 0;
  634. }
  635. template <typename P, const bool IsLiteral>
  636. void Parser<P, IsLiteral>::DisjunctionPass0(int depth)
  637. {
  638. AlternativePass0(depth);
  639. while (true)
  640. {
  641. // Could be terminating 0
  642. if (ECLookahead() != '|')
  643. return;
  644. ECConsume();
  645. AlternativePass0(depth);
  646. }
  647. }
  648. template <typename P, const bool IsLiteral>
  649. Node* Parser<P, IsLiteral>::DisjunctionPass1()
  650. {
  651. // Maintain the invariants:
  652. // - alt lists have two or more items
  653. // - alt list items are never alt lists (so we must inline them)
  654. // - an alt list never contains two consecutive match-character/match-set nodes
  655. // (so we must union consecutive items into a single set)
  656. Node* node = AlternativePass1();
  657. AltNode* last = 0;
  658. // First node may be an alternative
  659. if (node->tag == Node::Alt)
  660. {
  661. last = (AltNode*)node;
  662. while (last->tail != 0)
  663. last = last->tail;
  664. }
  665. while (true)
  666. {
  667. // Could be terminating 0
  668. if (ECLookahead() != '|')
  669. return node;
  670. ECConsume(); // '|'
  671. Node* next = AlternativePass1();
  672. AnalysisAssert(next != nullptr);
  673. Node* revisedPrev = UnionNodes(last == 0 ? node : last->head, next);
  674. if (revisedPrev != 0)
  675. {
  676. // Can merge next into previously seen alternative
  677. if (last == 0)
  678. node = revisedPrev;
  679. else
  680. last->head = revisedPrev;
  681. }
  682. else if (next->tag == Node::Alt)
  683. {
  684. AltNode* nextList = (AltNode*)next;
  685. // Since we just had to dereference next to get here, we know that nextList
  686. // can't be nullptr in this case.
  687. AnalysisAssert(nextList != nullptr);
  688. // Append inner list to current list
  689. revisedPrev = UnionNodes(last == 0 ? node : last->head, nextList->head);
  690. if (revisedPrev != 0)
  691. {
  692. // Can merge head of list into previously seen alternative
  693. if (last ==0)
  694. node = revisedPrev;
  695. else
  696. last->head = revisedPrev;
  697. nextList = nextList->tail;
  698. }
  699. if (last == 0)
  700. node = Anew(ctAllocator, AltNode, node, nextList);
  701. else
  702. last->tail = nextList;
  703. AnalysisAssert(nextList != nullptr);
  704. while (nextList->tail != 0)
  705. nextList = nextList->tail;
  706. last = nextList;
  707. }
  708. else
  709. {
  710. // Append node
  711. AltNode* cons = Anew(ctAllocator, AltNode, next, 0);
  712. if (last == 0)
  713. node = Anew(ctAllocator, AltNode, node, cons);
  714. else
  715. last->tail = cons;
  716. last = cons;
  717. }
  718. }
  719. }
  720. template <typename P, const bool IsLiteral>
  721. inline bool Parser<P, IsLiteral>::IsEndOfAlternative()
  722. {
  723. EncodedChar ec = ECLookahead();
  724. // Could be terminating 0, but embedded 0 is part of alternative
  725. return (ec == 0 && IsEOF()) || ec == ')' || ec == '|' || (IsLiteral && ec == '/');
  726. }
  727. template <typename P, const bool IsLiteral>
  728. void Parser<P, IsLiteral>::EnsureLitbuf(CharCount size)
  729. {
  730. if (litbufLen - litbufNext < size)
  731. {
  732. CharCount newLen = max(litbufLen, initLitbufSize);
  733. while (newLen < litbufNext + size)
  734. newLen *= 2;
  735. litbuf = (Char*)ctAllocator->Realloc(litbuf, litbufLen * sizeof(Char), newLen * sizeof(Char));
  736. litbufLen = newLen;
  737. }
  738. }
  739. template <typename P, const bool IsLiteral>
  740. void Parser<P, IsLiteral>::AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode)
  741. {
  742. Assert(charOrLiteralNode->tag == Node::MatchChar || charOrLiteralNode->tag == Node::MatchLiteral);
  743. CharCount addLen = charOrLiteralNode->LiteralLength();
  744. Assert(addLen > 0);
  745. if (deferredLiteralNode->length == 0)
  746. {
  747. // Start a new literal
  748. EnsureLitbuf(addLen);
  749. deferredLiteralNode->offset = litbufNext;
  750. deferredLiteralNode->length = addLen;
  751. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  752. }
  753. else if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
  754. {
  755. // Keep growing the current literal
  756. EnsureLitbuf(addLen);
  757. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  758. deferredLiteralNode->length += addLen;
  759. }
  760. else if (charOrLiteralNode->tag == Node::MatchLiteral && deferredLiteralNode->offset + deferredLiteralNode->length == ((MatchLiteralNode*)charOrLiteralNode)->offset)
  761. {
  762. // Absorb next literal into current literal since they are adjacent
  763. deferredLiteralNode->length += addLen;
  764. }
  765. else
  766. {
  767. // Abandon current literal and start a fresh one (leaves gap)
  768. EnsureLitbuf(deferredLiteralNode->length + addLen);
  769. js_wmemcpy_s(litbuf + litbufNext, litbufLen - litbufNext, litbuf + deferredLiteralNode->offset, deferredLiteralNode->length);
  770. deferredLiteralNode->offset = litbufNext;
  771. litbufNext += deferredLiteralNode->length;
  772. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  773. deferredLiteralNode->length += addLen;
  774. }
  775. }
  776. template <typename P, const bool IsLiteral>
  777. Node* Parser<P, IsLiteral>::FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode)
  778. {
  779. if (node == deferredLiteralNode)
  780. {
  781. #if DBG
  782. if (deferredLiteralNode->length == 0)
  783. Assert(false);
  784. #endif
  785. Assert(deferredLiteralNode->offset < litbufNext);
  786. Assert(deferredLiteralNode->offset + deferredLiteralNode->length <= litbufNext);
  787. if (deferredLiteralNode->length == 1)
  788. {
  789. node = Anew(ctAllocator, MatchCharNode, litbuf[deferredLiteralNode->offset]);
  790. if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
  791. // Reclaim last added character
  792. litbufNext--;
  793. // else: leave a gap in the literal buffer
  794. }
  795. else
  796. node = Anew(ctAllocator, MatchLiteralNode, *deferredLiteralNode);
  797. deferredLiteralNode->offset = 0;
  798. deferredLiteralNode->length = 0;
  799. }
  800. return node;
  801. }
  802. template <typename P, const bool IsLiteral>
  803. void Parser<P, IsLiteral>::AlternativePass0(int depth)
  804. {
  805. while (!IsEndOfAlternative())
  806. TermPass0(depth);
  807. }
  808. template <typename P, const bool IsLiteral>
  809. Node* Parser<P, IsLiteral>::AlternativePass1()
  810. {
  811. if (IsEndOfAlternative())
  812. return Anew(ctAllocator, SimpleNode, Node::Empty);
  813. MatchCharNode deferredCharNode(0);
  814. MatchLiteralNode deferredLiteralNode(0, 0);
  815. // Maintain the invariants:
  816. // - concat lists have two or more items
  817. // - concat list items are never concat lists
  818. // - a concat list never contains two consecutive match-character/match-literal nodes
  819. bool previousSurrogatePart = false;
  820. Node* node = TermPass1(&deferredCharNode, previousSurrogatePart);
  821. AnalysisAssert(node != nullptr);
  822. ConcatNode* last = 0;
  823. // First node may be a concat
  824. if (node->tag == Node::Concat)
  825. {
  826. last = (ConcatNode*)node;
  827. while (last->tail != 0)
  828. last = last->tail;
  829. }
  830. if (last == 0)
  831. {
  832. if (node->LiteralLength() > 0)
  833. {
  834. // Begin a new literal
  835. AccumLiteral(&deferredLiteralNode, node);
  836. node = &deferredLiteralNode;
  837. }
  838. }
  839. else
  840. {
  841. if (last->head->LiteralLength() > 0)
  842. {
  843. // Begin a new literal
  844. AccumLiteral(&deferredLiteralNode, last->head);
  845. last->head = &deferredLiteralNode;
  846. }
  847. }
  848. while (!IsEndOfAlternative())
  849. {
  850. Node* next = TermPass1(&deferredCharNode, previousSurrogatePart);
  851. AnalysisAssert(next != nullptr);
  852. if (next->LiteralLength() > 0)
  853. {
  854. // Begin a new literal or grow the existing literal
  855. AccumLiteral(&deferredLiteralNode, next);
  856. if (last == 0)
  857. {
  858. if (node != &deferredLiteralNode)
  859. {
  860. // So far we have first item and the current literal
  861. ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
  862. node = Anew(ctAllocator, ConcatNode, node, cons);
  863. last = cons;
  864. }
  865. // else: keep growing first literal
  866. }
  867. else
  868. {
  869. if (last->head != &deferredLiteralNode)
  870. {
  871. // Append a new literal node
  872. ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
  873. last->tail = cons;
  874. last = cons;
  875. }
  876. // else: keep growing current literal
  877. }
  878. }
  879. else if (next->tag == Node::Concat)
  880. {
  881. // Append this list to accumulated list
  882. ConcatNode* nextList = (ConcatNode*)next;
  883. if (nextList->head->LiteralLength() > 0 &&
  884. ((last == 0 && node == &deferredLiteralNode) ||
  885. (last != 0 && last->head == &deferredLiteralNode)))
  886. {
  887. // Absorb the next character or literal into the current literal
  888. // (may leave a gab in litbuf)
  889. AccumLiteral(&deferredLiteralNode, nextList->head);
  890. nextList = nextList->tail;
  891. // List has at least two items
  892. AnalysisAssert(nextList != 0);
  893. // List should be in canonical form, so no consecutive chars/literals
  894. Assert(nextList->head->LiteralLength() == 0);
  895. }
  896. if (last == 0)
  897. node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), nextList);
  898. else
  899. {
  900. last->head = FinalTerm(last->head, &deferredLiteralNode);
  901. last->tail = nextList;
  902. }
  903. // NextList can't be nullptr, since it was last set as next, which
  904. // was dereferenced, or it was set on a path that already has this
  905. // analysis assert.
  906. AnalysisAssert(nextList != nullptr);
  907. while (nextList->tail != 0)
  908. nextList = nextList->tail;
  909. last = nextList;
  910. // No outstanding literals
  911. Assert(deferredLiteralNode.length == 0);
  912. // We just set this from nextList, which we know is not nullptr.
  913. AnalysisAssert(last != nullptr);
  914. if (last->head->LiteralLength() > 0)
  915. {
  916. // If the list ends with a literal, transfer it into deferredLiteralNode
  917. // so we can continue accumulating (won't leave a gab in litbuf)
  918. AccumLiteral(&deferredLiteralNode, last->head);
  919. // Can discard MatchLiteralNode since it lives in compile-time allocator
  920. last->head = &deferredLiteralNode;
  921. }
  922. }
  923. else
  924. {
  925. // Append this node to accumulated list
  926. ConcatNode* cons = Anew(ctAllocator, ConcatNode, next, 0);
  927. if (last == 0)
  928. node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), cons);
  929. else
  930. {
  931. last->head = FinalTerm(last->head, &deferredLiteralNode);
  932. last->tail = cons;
  933. }
  934. last = cons;
  935. // No outstanding literals
  936. Assert(deferredLiteralNode.length == 0);
  937. }
  938. }
  939. if (last == 0)
  940. node = FinalTerm(node, &deferredLiteralNode);
  941. else
  942. last->head = FinalTerm(last->head, &deferredLiteralNode);
  943. // No outstanding literals
  944. Assert(deferredLiteralNode.length == 0);
  945. return node;
  946. }
  947. //
  948. // Terms
  949. //
  950. template <typename P, const bool IsLiteral>
  951. Node* Parser<P, IsLiteral>::NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body)
  952. {
  953. //
  954. // NOTE: We'd like to represent r? (i.e. r{0,1}) as r|<empty> since the loop representation has high overhead.
  955. // HOWEVER if r contains a group definition and could match empty then we must execute as a loop
  956. // so that group bindings are correctly reset on no progress (e.g.: /(a*)?/.exec("")). Thus we defer
  957. // this optimization until pass 4 of the optimizer, at which point we know whether r could match empty.
  958. //
  959. if (lower == 1 && upper == 1)
  960. return body;
  961. else if (lower == 0 && upper == 0)
  962. // Loop is equivalent to empty. If the loop body contains group definitions they will have already been
  963. // counted towards the overall number of groups. The matcher will initialize their contents to
  964. // undefined, and since the loop body would never execute the inner groups could never be updated from
  965. // undefined.
  966. return Anew(ctAllocator, SimpleNode, Node::Empty);
  967. else
  968. return Anew(ctAllocator, LoopNode, lower, upper, isGreedy, body);
  969. }
  970. template <typename P, const bool IsLiteral>
  971. bool Parser<P, IsLiteral>::AtQuantifier()
  972. {
  973. // Could be terminating 0
  974. switch (ECLookahead())
  975. {
  976. case '*':
  977. case '+':
  978. case '?':
  979. return true;
  980. case '{':
  981. {
  982. CharCount lookahead = 1;
  983. while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
  984. lookahead++;
  985. if (lookahead == 1 || !ECCanConsume(lookahead + 1))
  986. return false;
  987. switch (ECLookahead(lookahead))
  988. {
  989. case ',':
  990. lookahead++;
  991. if (ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}')
  992. return true;
  993. else
  994. {
  995. CharCount saved = lookahead;
  996. while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
  997. lookahead++;
  998. if (lookahead == saved)
  999. return false;
  1000. return ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}';
  1001. }
  1002. case '}':
  1003. return true;
  1004. default:
  1005. return false;
  1006. }
  1007. }
  1008. default:
  1009. return false;
  1010. }
  1011. }
  1012. template <typename P, const bool IsLiteral>
  1013. bool Parser<P, IsLiteral>::OptNonGreedy()
  1014. {
  1015. // Could be terminating 0
  1016. if (ECLookahead() != '?')
  1017. return true;
  1018. ECConsume();
  1019. return false;
  1020. }
  1021. template <typename P, const bool IsLiteral>
  1022. CharCount Parser<P, IsLiteral>::RepeatCount()
  1023. {
  1024. CharCount n = 0;
  1025. int digits = 0;
  1026. while (true)
  1027. {
  1028. // Could be terminating 0
  1029. EncodedChar ec = ECLookahead();
  1030. if (!standardEncodedChars->IsDigit(ec))
  1031. {
  1032. if (digits == 0)
  1033. {
  1034. Fail(JSERR_RegExpSyntax);
  1035. }
  1036. return n;
  1037. }
  1038. if (n > MaxCharCount / 10)
  1039. {
  1040. break;
  1041. }
  1042. n *= 10;
  1043. if (n > MaxCharCount - standardEncodedChars->DigitValue(ec))
  1044. {
  1045. break;
  1046. }
  1047. n += standardEncodedChars->DigitValue(ec);
  1048. digits++;
  1049. ECConsume();
  1050. }
  1051. Assert(digits != 0); // shouldn't be able to reach here with (digits == 0)
  1052. // The token had a value larger than MaxCharCount so we didn't return the value and reached here instead.
  1053. // Consume the rest of the token and return MaxCharCount.
  1054. while (true)
  1055. {
  1056. EncodedChar ec = ECLookahead();
  1057. if (!standardEncodedChars->IsDigit(ec))
  1058. {
  1059. return MaxCharCount;
  1060. }
  1061. ECConsume();
  1062. }
  1063. }
  1064. template <typename P, const bool IsLiteral>
  1065. void Parser<P, IsLiteral>::TermPass0(int depth)
  1066. {
  1067. PROBE_STACK_NO_DISPOSE(scriptContext, Js::Constants::MinStackRegex);
  1068. // Either we have a location at the start, or the end, never both. As in between it should have been cleared if surrogate pair
  1069. // Or must be cleared if we didn't perform the check
  1070. bool clearLocationIfPresent = this->tempLocationOfSurrogatePair != nullptr;
  1071. switch (ECLookahead())
  1072. {
  1073. case '^':
  1074. case '$':
  1075. ECConsume();
  1076. return;
  1077. case '\\':
  1078. ECConsume();
  1079. if (AtomEscapePass0())
  1080. return;
  1081. break;
  1082. case '(':
  1083. // Can't combine into a single codeunit because of group present
  1084. this->tempLocationOfSurrogatePair = nullptr;
  1085. this->codePointAtTempLocation = 0;
  1086. clearLocationIfPresent = false;
  1087. ECConsume();
  1088. switch (ECLookahead())
  1089. {
  1090. case '?':
  1091. if (!ECCanConsume(2))
  1092. Fail(JSERR_RegExpSyntax);
  1093. switch (ECLookahead(1))
  1094. {
  1095. case '=':
  1096. case '!':
  1097. case ':':
  1098. ECConsume(2);
  1099. break;
  1100. default:
  1101. numGroups++;
  1102. break;
  1103. }
  1104. break;
  1105. default:
  1106. numGroups++;
  1107. break;
  1108. }
  1109. DisjunctionPass0(depth + 1);
  1110. ECMust(')', JSERR_RegExpNoParen);
  1111. break;
  1112. case '.':
  1113. ECConsume();
  1114. break;
  1115. case '[':
  1116. ECConsume();
  1117. this->tempLocationOfSurrogatePair = nullptr;
  1118. this->codePointAtTempLocation = 0;
  1119. this->tempLocationOfRange = next;
  1120. CharacterClassPass0();
  1121. this->tempLocationOfRange = nullptr;
  1122. ECMust(']', JSERR_RegExpNoBracket);
  1123. break;
  1124. case ')':
  1125. case '|':
  1126. Fail(JSERR_RegExpSyntax);
  1127. break;
  1128. case ']':
  1129. case '}':
  1130. NextChar();
  1131. break;
  1132. case '*':
  1133. case '+':
  1134. case '?':
  1135. case '{':
  1136. if (AtQuantifier())
  1137. Fail(JSERR_RegExpBadQuant);
  1138. else
  1139. ECConsume();
  1140. break;
  1141. case 0:
  1142. if (IsEOF())
  1143. // Terminating 0
  1144. Fail(JSERR_RegExpSyntax);
  1145. // else fall-through for embedded 0
  1146. default:
  1147. {
  1148. const EncodedChar* current = next;
  1149. Char c = NextChar();
  1150. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1151. {
  1152. TrackIfSurrogatePair(c, current, (uint32)(next - current));
  1153. }
  1154. // Closing '/' in literals should be caught explicitly
  1155. Assert(!IsLiteral || c != '/');
  1156. }
  1157. break;
  1158. }
  1159. if (clearLocationIfPresent && this->tempLocationOfSurrogatePair != nullptr)
  1160. {
  1161. this->tempLocationOfSurrogatePair = nullptr;
  1162. this->codePointAtTempLocation = 0;
  1163. }
  1164. if (AtQuantifier())
  1165. {
  1166. switch (ECLookahead())
  1167. {
  1168. case '*':
  1169. case '+':
  1170. case '?':
  1171. ECConsume();
  1172. OptNonGreedy();
  1173. break;
  1174. case '{':
  1175. {
  1176. ECConsume();
  1177. CharCount lower = RepeatCount();
  1178. switch (ECLookahead())
  1179. {
  1180. case ',':
  1181. ECConsume();
  1182. if (ECLookahead() == '}')
  1183. {
  1184. ECConsume();
  1185. OptNonGreedy();
  1186. }
  1187. else
  1188. {
  1189. CharCount upper = RepeatCount();
  1190. if (upper < lower)
  1191. Fail(JSERR_RegExpSyntax);
  1192. Assert(ECLookahead() == '}');
  1193. ECConsume();
  1194. OptNonGreedy();
  1195. }
  1196. break;
  1197. case '}':
  1198. ECConsume();
  1199. OptNonGreedy();
  1200. break;
  1201. default:
  1202. Assert(false);
  1203. break;
  1204. }
  1205. break;
  1206. }
  1207. default:
  1208. Assert(false);
  1209. break;
  1210. }
  1211. }
  1212. }
  1213. template <typename P, const bool IsLiteral>
  1214. Node* Parser<P, IsLiteral>::TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1215. {
  1216. PROBE_STACK_NO_DISPOSE(scriptContext, Js::Constants::MinStackRegex);
  1217. Node* node = 0;
  1218. bool containsSurrogatePair = false;
  1219. switch (ECLookahead())
  1220. {
  1221. case '^':
  1222. ECConsume();
  1223. return Anew(ctAllocator, SimpleNode, Node::BOL); // No quantifier allowed
  1224. case '$':
  1225. ECConsume();
  1226. return Anew(ctAllocator, SimpleNode, Node::EOL); // No quantifier allowed
  1227. case '\\':
  1228. ECConsume();
  1229. if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
  1230. {
  1231. break; // For quantifier
  1232. }
  1233. else if (AtomEscapePass1(node, deferredCharNode, previousSurrogatePart))
  1234. {
  1235. return node; // No quantifier allowed
  1236. }
  1237. break; // else: fall-through for opt quantifier
  1238. case '(':
  1239. ECConsume();
  1240. switch (ECLookahead())
  1241. {
  1242. case '?':
  1243. switch (ECLookahead(1))
  1244. {
  1245. case '=':
  1246. ECConsume(2); // ?=
  1247. node = DisjunctionPass1();
  1248. Assert(ECLookahead() == ')');
  1249. ECConsume(); // )
  1250. node = Anew(ctAllocator, AssertionNode, false, node);
  1251. break; // As per Annex B, allow this to be quantifiable
  1252. case '!':
  1253. ECConsume(2); // ?!
  1254. node = DisjunctionPass1();
  1255. Assert(ECLookahead() == ')');
  1256. ECConsume(); // )
  1257. node = Anew(ctAllocator, AssertionNode, true, node);
  1258. break; // As per Annex B, allow this to be quantifiable
  1259. case ':':
  1260. ECConsume(2); // ?:
  1261. node = DisjunctionPass1();
  1262. Assert(ECLookahead() == ')');
  1263. ECConsume(); // )
  1264. break; // fall-through for opt quantifier
  1265. default:
  1266. {
  1267. // ? not yet consumed
  1268. int thisGroupId = nextGroupId++;
  1269. node = DisjunctionPass1();
  1270. Assert(ECLookahead() == ')');
  1271. ECConsume(); // )
  1272. node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
  1273. break; // fall-through for opt quantifier
  1274. }
  1275. }
  1276. break;
  1277. default:
  1278. {
  1279. // next char not yet consumed
  1280. int thisGroupId = nextGroupId++;
  1281. node = DisjunctionPass1();
  1282. Assert(ECLookahead() == ')');
  1283. ECConsume(); // )
  1284. node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
  1285. break; // fall-through for opt quantifier
  1286. }
  1287. }
  1288. break;
  1289. case '.':
  1290. {
  1291. ECConsume();
  1292. node = GetNodeWithValidCharacterSet('.');
  1293. break; // fall-through for opt quantifier
  1294. }
  1295. case '[':
  1296. ECConsume();
  1297. if (unicodeFlagPresent)
  1298. {
  1299. containsSurrogatePair = this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->rangeLocation == next;
  1300. }
  1301. node = containsSurrogatePair ? CharacterClassPass1<true>() : CharacterClassPass1<false>();
  1302. Assert(ECLookahead() == ']');
  1303. ECConsume(); // ]
  1304. break; // fall-through for opt quantifier
  1305. #if DBG
  1306. case ')':
  1307. case '|':
  1308. Assert(false);
  1309. break;
  1310. #endif
  1311. case ']':
  1312. // SPEC DEVIATION: This should be syntax error, instead accept as itself
  1313. deferredCharNode->cs[0] = NextChar();
  1314. node = deferredCharNode;
  1315. break; // fall-through for opt quantifier
  1316. #if DBG
  1317. case '*':
  1318. case '+':
  1319. case '?':
  1320. AssertMsg(false, "Allowed only in the escaped form. These should be caught by TermPass0.");
  1321. break;
  1322. #endif
  1323. case 0:
  1324. if (IsEOF())
  1325. // Terminating 0
  1326. Fail(JSERR_RegExpSyntax);
  1327. // else fall-through for embedded 0
  1328. default:
  1329. if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
  1330. {
  1331. break; //For quantifier
  1332. }
  1333. else
  1334. {
  1335. deferredCharNode->cs[0] = NextChar();
  1336. node = deferredCharNode;
  1337. break; // fall-through for opt quantifier
  1338. }
  1339. }
  1340. Assert(node != 0);
  1341. if (AtQuantifier())
  1342. {
  1343. switch (ECLookahead())
  1344. {
  1345. case '*':
  1346. if (node == deferredCharNode)
  1347. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1348. ECConsume();
  1349. return NewLoopNode(0, CharCountFlag, OptNonGreedy(), node);
  1350. case '+':
  1351. if (node == deferredCharNode)
  1352. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1353. ECConsume();
  1354. return NewLoopNode(1, CharCountFlag, OptNonGreedy(), node);
  1355. case '?':
  1356. if (node == deferredCharNode)
  1357. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1358. ECConsume();
  1359. return NewLoopNode(0, 1, OptNonGreedy(), node);
  1360. case '{':
  1361. {
  1362. if (node == deferredCharNode)
  1363. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1364. ECConsume();
  1365. CharCount lower = RepeatCount();
  1366. switch (ECLookahead())
  1367. {
  1368. case ',':
  1369. ECConsume();
  1370. if (ECLookahead() == '}')
  1371. {
  1372. ECConsume();
  1373. return NewLoopNode(lower, CharCountFlag, OptNonGreedy(), node);
  1374. }
  1375. else
  1376. {
  1377. CharCount upper = RepeatCount();
  1378. Assert(lower <= upper);
  1379. Assert(ECLookahead() == '}');
  1380. ECConsume(); // }
  1381. return NewLoopNode(lower, upper, OptNonGreedy(), node);
  1382. }
  1383. case '}':
  1384. ECConsume();
  1385. return NewLoopNode(lower, lower, OptNonGreedy(), node);
  1386. default:
  1387. Assert(false);
  1388. break;
  1389. }
  1390. break;
  1391. }
  1392. default:
  1393. Assert(false);
  1394. break;
  1395. }
  1396. }
  1397. return node;
  1398. }
  1399. #pragma warning(push)
  1400. #pragma warning(disable:4702) // unreachable code
  1401. template <typename P, const bool IsLiteral>
  1402. bool Parser<P, IsLiteral>::AtomEscapePass0()
  1403. {
  1404. EncodedChar ec = ECLookahead();
  1405. if (ec == 0 && IsEOF())
  1406. {
  1407. // Terminating 0
  1408. Fail(JSERR_RegExpSyntax);
  1409. return false;
  1410. }
  1411. else if (standardEncodedChars->IsDigit(ec))
  1412. {
  1413. do
  1414. {
  1415. ECConsume();
  1416. }
  1417. while (standardEncodedChars->IsDigit(ECLookahead())); // terminating 0 is not a digit
  1418. return false;
  1419. }
  1420. else if (ECLookahead() == 'c')
  1421. {
  1422. if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
  1423. {
  1424. ECConsume(2);
  1425. }
  1426. else
  1427. {
  1428. DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
  1429. }
  1430. return false;
  1431. }
  1432. else
  1433. {
  1434. const EncodedChar *current = next;
  1435. // An escaped '/' is ok
  1436. Char c = NextChar();
  1437. switch (c)
  1438. {
  1439. case 'b':
  1440. case 'B':
  1441. return true;
  1442. // case 'c': handled as special case above
  1443. case 'x':
  1444. if (ECCanConsume(2) &&
  1445. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1446. standardEncodedChars->IsHex(ECLookahead(1)))
  1447. ECConsume(2);
  1448. break;
  1449. case 'u':
  1450. bool surrogateEncountered = false;
  1451. int lengthOfSurrogate = TryParseExtendedUnicodeEscape(c, surrogateEncountered, true);
  1452. if (lengthOfSurrogate > 0)
  1453. {
  1454. if (surrogateEncountered)
  1455. {
  1456. // If we don't have an allocator, we don't create nodes
  1457. // Asserts in place as extra checks for when we do have an allocator
  1458. Assert(this->ctAllocator == nullptr || this->currentSurrogatePairNode != nullptr);
  1459. Assert(this->ctAllocator == nullptr || current == this->currentSurrogatePairNode->location);
  1460. ECConsume(lengthOfSurrogate);
  1461. }
  1462. //Don't fall through
  1463. break;
  1464. }
  1465. else if (ECCanConsume(4) &&
  1466. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1467. standardEncodedChars->IsHex(ECLookahead(1)) &&
  1468. standardEncodedChars->IsHex(ECLookahead(2)) &&
  1469. standardEncodedChars->IsHex(ECLookahead(3)))
  1470. {
  1471. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1472. {
  1473. codepoint_t value = (standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  1474. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  1475. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  1476. (standardEncodedChars->DigitValue(ECLookahead(3)));
  1477. TrackIfSurrogatePair(value, (next - 1), 5);
  1478. }
  1479. ECConsume(4);
  1480. }
  1481. break;
  1482. }
  1483. // embedded 0 is ok
  1484. return false;
  1485. }
  1486. }
  1487. #pragma warning(pop)
  1488. template <typename P, const bool IsLiteral>
  1489. bool Parser<P, IsLiteral>::AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1490. {
  1491. Assert(!IsEOF()); // checked for terminating 0 in pass 0
  1492. if (standardEncodedChars->IsDigit(ECLookahead()))
  1493. {
  1494. // As per Annex B, allow octal escapes as well as group references, disambiguate based on known
  1495. // number of groups.
  1496. if (ECLookahead() == '0')
  1497. {
  1498. // fall through for octal
  1499. }
  1500. else
  1501. {
  1502. // Could be a group reference, but only if between 1 and 5 digits which resolve to a valid group number
  1503. int n = 0;
  1504. CharCount digits = 0;
  1505. do
  1506. {
  1507. n = n * 10 + (int)standardEncodedChars->DigitValue(ECLookahead(digits));
  1508. digits++;
  1509. }
  1510. while (digits < 5 && ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits)));
  1511. if (n >= numGroups ||
  1512. (ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits))))
  1513. {
  1514. if (standardEncodedChars->IsOctal(ECLookahead()))
  1515. {
  1516. // fall through for octal
  1517. }
  1518. else
  1519. {
  1520. // \8 and \9 are identity escapes
  1521. deferredCharNode->cs[0] = Chars<EncodedChar>::CTW(ECLookahead());
  1522. ECConsume();
  1523. node = deferredCharNode;
  1524. return false; // not an assertion
  1525. }
  1526. }
  1527. else
  1528. {
  1529. ECConsume(digits);
  1530. node = Anew(ctAllocator, MatchGroupNode, n);
  1531. return false; // not an assertion
  1532. }
  1533. }
  1534. // Must be between 1 and 3 octal digits
  1535. Assert(standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
  1536. uint n = 0;
  1537. CharCount digits = 0;
  1538. do
  1539. {
  1540. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  1541. if (m > Chars<uint8>::MaxUChar) // Regex octal codes only support single byte (ASCII) characters.
  1542. break;
  1543. n = m;
  1544. ECConsume();
  1545. digits++;
  1546. }
  1547. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
  1548. deferredCharNode->cs[0] = UTC((UChar)n);
  1549. node = deferredCharNode;
  1550. return false; // not an assertion
  1551. }
  1552. else if (ECLookahead() == 'c')
  1553. {
  1554. Char c;
  1555. if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
  1556. {
  1557. c = UTC(Chars<EncodedChar>::CTU(ECLookahead(1)) % 32);
  1558. ECConsume(2);
  1559. }
  1560. else
  1561. {
  1562. // SPEC DEVIATION: For non-letters or EOF, take the leading '\' to be itself, and
  1563. // don't consume the 'c' or letter.
  1564. c = '\\';
  1565. }
  1566. deferredCharNode->cs[0] = c;
  1567. node = deferredCharNode;
  1568. return false; // not an assertion
  1569. }
  1570. else
  1571. {
  1572. Char c = NextChar();
  1573. switch (c)
  1574. {
  1575. case 'b':
  1576. node = Anew(ctAllocator, WordBoundaryNode, false);
  1577. return true; // Is an assertion
  1578. case 'B':
  1579. node = Anew(ctAllocator, WordBoundaryNode, true);
  1580. return true; // Is an assertion
  1581. case 'f':
  1582. c = '\f';
  1583. break; // fall-through for identity escape
  1584. case 'n':
  1585. c = '\n';
  1586. break; // fall-through for identity escape
  1587. case 'r':
  1588. c = '\r';
  1589. break; // fall-through for identity escape
  1590. case 't':
  1591. c = '\t';
  1592. break; // fall-through for identity escape
  1593. case 'v':
  1594. c = '\v';
  1595. break; // fall-through for identity escape
  1596. case 'd':
  1597. {
  1598. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1599. standardChars->SetDigits(ctAllocator, setNode->set);
  1600. node = setNode;
  1601. return false; // not an assertion
  1602. }
  1603. case 'D':
  1604. {
  1605. node = GetNodeWithValidCharacterSet('D');
  1606. return false; // not an assertion
  1607. }
  1608. case 's':
  1609. {
  1610. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1611. standardChars->SetWhitespace(ctAllocator, setNode->set);
  1612. node = setNode;
  1613. return false; // not an assertion
  1614. }
  1615. case 'S':
  1616. {
  1617. node = GetNodeWithValidCharacterSet('S');
  1618. return false; // not an assertion
  1619. }
  1620. case 'w':
  1621. {
  1622. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1623. if (this->unicodeFlagPresent && this->caseInsensitiveFlagPresent)
  1624. {
  1625. standardChars->SetWordIUChars(ctAllocator, setNode->set);
  1626. }
  1627. else
  1628. {
  1629. standardChars->SetWordChars(ctAllocator, setNode->set);
  1630. }
  1631. node = setNode;
  1632. return false; // not an assertion
  1633. }
  1634. case 'W':
  1635. {
  1636. node = GetNodeWithValidCharacterSet('W');
  1637. return false; // not an assertion
  1638. }
  1639. // case 'c': handled as special case above
  1640. case 'x':
  1641. if (ECCanConsume(2) &&
  1642. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1643. standardEncodedChars->IsHex(ECLookahead(1)))
  1644. {
  1645. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  1646. (standardEncodedChars->DigitValue(ECLookahead(1))));
  1647. ECConsume(2);
  1648. // fall-through for identity escape
  1649. }
  1650. // Take to be identity escape if ill-formed as per Annex B
  1651. break;
  1652. case 'u':
  1653. if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
  1654. break;
  1655. else if (ECCanConsume(4) &&
  1656. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1657. standardEncodedChars->IsHex(ECLookahead(1)) &&
  1658. standardEncodedChars->IsHex(ECLookahead(2)) &&
  1659. standardEncodedChars->IsHex(ECLookahead(3)))
  1660. {
  1661. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  1662. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  1663. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  1664. (standardEncodedChars->DigitValue(ECLookahead(3))));
  1665. ECConsume(4);
  1666. // fall-through for identity escape
  1667. }
  1668. // Take to be identity escape if ill-formed as per Annex B
  1669. break;
  1670. case '^':
  1671. case '$':
  1672. case '\\':
  1673. case '.':
  1674. case '*':
  1675. case '+':
  1676. case '?':
  1677. case '(':
  1678. case ')':
  1679. case '[':
  1680. case ']':
  1681. case '{':
  1682. case '}':
  1683. case '|':
  1684. case '/':
  1685. break; // fall-through for identity escape
  1686. default:
  1687. if (this->unicodeFlagPresent)
  1688. {
  1689. // As per #sec-forbidden-extensions, if unicode flag is present, we must disallow any other escape.
  1690. this->Fail(JSERR_RegExpInvalidEscape); // throw SyntaxError
  1691. }
  1692. // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok
  1693. break;
  1694. }
  1695. // Must be an identity escape
  1696. deferredCharNode->cs[0] = c;
  1697. node = deferredCharNode;
  1698. return false; // not an assertion
  1699. }
  1700. }
  1701. template <typename P, const bool IsLiteral>
  1702. bool Parser<P, IsLiteral>::SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1703. {
  1704. if (!this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() || !unicodeFlagPresent)
  1705. {
  1706. return false;
  1707. }
  1708. if (this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
  1709. {
  1710. AssertMsg(!this->currentSurrogatePairNode->IsInsideRange(), "Should not be calling this pass if we are currently inside a range.");
  1711. char16 lower, upper;
  1712. uint tableIndex = 0, actualHigh = 0;
  1713. codepoint_t equivClass[CaseInsensitive::EquivClassSize];
  1714. if (caseInsensitiveFlagPresent && CaseInsensitive::RangeToEquivClass(tableIndex, this->currentSurrogatePairNode->value, this->currentSurrogatePairNode->value, actualHigh, equivClass))
  1715. {
  1716. Node *equivNode[CaseInsensitive::EquivClassSize];
  1717. int indexForNextNode = 0;
  1718. for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
  1719. {
  1720. bool alreadyAdded = false;
  1721. for (int j = 0; j < i; j++)
  1722. {
  1723. if (equivClass[i] == equivClass[j])
  1724. {
  1725. alreadyAdded = true;
  1726. break;
  1727. }
  1728. }
  1729. if (!alreadyAdded)
  1730. {
  1731. if (Js::NumberUtilities::IsInSupplementaryPlane(equivClass[i]))
  1732. {
  1733. Js::NumberUtilities::CodePointAsSurrogatePair(equivClass[i], &lower, &upper);
  1734. equivNode[indexForNextNode] = CreateSurrogatePairAtom(lower, upper);
  1735. }
  1736. else
  1737. {
  1738. equivNode[indexForNextNode] = Anew(ctAllocator, MatchCharNode, (Char)equivClass[i]);
  1739. }
  1740. indexForNextNode ++;
  1741. }
  1742. }
  1743. Assert(indexForNextNode > 0);
  1744. if (indexForNextNode == 1)
  1745. {
  1746. node = equivNode[0];
  1747. }
  1748. else
  1749. {
  1750. AltNode *altNode = Anew(ctAllocator, AltNode, equivNode[0], nullptr);
  1751. AltNode *altNodeTail = altNode;
  1752. for (int i = 1; i < indexForNextNode; i++)
  1753. {
  1754. altNodeTail->tail = Anew(ctAllocator, AltNode, equivNode[i], nullptr);
  1755. altNodeTail = altNodeTail->tail;
  1756. }
  1757. node = altNode;
  1758. }
  1759. }
  1760. else
  1761. {
  1762. Js::NumberUtilities::CodePointAsSurrogatePair(this->currentSurrogatePairNode->value, &lower, &upper);
  1763. node = CreateSurrogatePairAtom(lower, upper);
  1764. }
  1765. previousSurrogatePart = false;
  1766. Assert(ECCanConsume(this->currentSurrogatePairNode->length));
  1767. ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
  1768. this->RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
  1769. this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
  1770. return true;
  1771. }
  1772. return false;
  1773. }
  1774. //
  1775. // Classes
  1776. //
  1777. template <typename P, const bool IsLiteral>
  1778. bool Parser<P, IsLiteral>::AtSecondSingletonClassAtom()
  1779. {
  1780. Assert(ECLookahead() == '-');
  1781. if (ECLookahead(1) == '\\')
  1782. {
  1783. switch (ECLookahead(2))
  1784. {
  1785. case 'd':
  1786. case 'D':
  1787. case 's':
  1788. case 'S':
  1789. case 'w':
  1790. case 'W':
  1791. // These all denote non-singleton sets
  1792. return false;
  1793. default:
  1794. // fall-through for singleton
  1795. break;
  1796. }
  1797. }
  1798. return true;
  1799. }
  1800. template <typename P, const bool IsLiteral>
  1801. void Parser<P, IsLiteral>::CharacterClassPass0()
  1802. {
  1803. // Could be terminating 0
  1804. if (ECLookahead() == '^')
  1805. ECConsume();
  1806. EncodedChar nextChar = ECLookahead();
  1807. const EncodedChar* current;
  1808. codepoint_t lastCodepoint = INVALID_CODEPOINT;
  1809. codepoint_t pendingRangeStart = INVALID_CODEPOINT;
  1810. codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
  1811. bool previousSurrogatePart = false;
  1812. while(nextChar != ']')
  1813. {
  1814. current = next;
  1815. if (nextChar == '\0' && IsEOF())
  1816. {
  1817. // Report as unclosed '['
  1818. Fail(JSERR_RegExpNoBracket);
  1819. return;
  1820. } // Otherwise embedded '\0' is ok
  1821. else if (nextChar == '\\')
  1822. {
  1823. // Consume, as classescapepass0 expects for it to be consumed
  1824. Char outChar = NextChar();
  1825. // If previousSurrogatePart = true upon leaving this method, then we are going to pass through here twice
  1826. // This is because \u{} escape sequence was encountered that is actually 2 characters, the second time we will pass consuming entire character
  1827. if (ClassEscapePass0(outChar, previousSurrogatePart))
  1828. {
  1829. lastCodepoint = outChar;
  1830. }
  1831. else
  1832. {
  1833. // Last codepoint isn't a singleton, so no codepoint tracking for the sake of ranges is needed.
  1834. lastCodepoint = INVALID_CODEPOINT;
  1835. // Unless we have a possible range end, cancel our range tracking.
  1836. if (pendingRangeEnd == INVALID_CODEPOINT)
  1837. {
  1838. pendingRangeStart = INVALID_CODEPOINT;
  1839. }
  1840. }
  1841. }
  1842. else if (nextChar == '-')
  1843. {
  1844. if (pendingRangeStart != INVALID_CODEPOINT || lastCodepoint == INVALID_CODEPOINT)
  1845. {
  1846. // '-' is the upper part of the range, with pendingRangeStart codepoint is the lower. Set lastCodePoint to '-' to check at the end of the while statement
  1847. // OR
  1848. // '-' is just a char, consume it and set as last char
  1849. lastCodepoint = NextChar();
  1850. }
  1851. else
  1852. {
  1853. pendingRangeStart = this->next == this->positionAfterLastSurrogate
  1854. ? this->valueOfLastSurrogate
  1855. : lastCodepoint;
  1856. lastCodepoint = INVALID_CODEPOINT;
  1857. NextChar();
  1858. }
  1859. // If we have a pattern of the form [\ud800-\udfff], we need this to be interpreted as a range.
  1860. // In order to achieve this, the two variables that we use to track surrogate pairs, namely
  1861. // tempLocationOfSurrogatePair and previousSurrogatePart, need to be in a certain state.
  1862. //
  1863. // We need to reset tempLocationOfSurrogatePair as it points to the first Unicode escape (\ud800)
  1864. // when we're here. We need to clear it in order not to have a surrogate pair when we process the
  1865. // second escape (\udfff).
  1866. //
  1867. // previousSurrogatePart is used when we have a code point in the \u{...} extended format and the
  1868. // character is in a supplementary plane. However, there is no need to change its value here. When
  1869. // such an escape sequence is encountered, the first call to ClassEscapePass0() sets the variable
  1870. // to true, but it rewinds the input back to the beginning of the escape sequence. The next
  1871. // iteration of the loop here will again call ClassEscape0() with the same character and the
  1872. // variable will this time be set to false. Therefore, the variable will always be false here.
  1873. tempLocationOfSurrogatePair = nullptr;
  1874. Assert(!previousSurrogatePart);
  1875. }
  1876. else
  1877. {
  1878. lastCodepoint = NextChar();
  1879. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1880. {
  1881. TrackIfSurrogatePair(lastCodepoint, current, (uint32)(next - current));
  1882. }
  1883. }
  1884. if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1885. {
  1886. // This is much easier to handle.
  1887. if (pendingRangeStart != INVALID_CODEPOINT && lastCodepoint != INVALID_CODEPOINT)
  1888. {
  1889. Assert(pendingRangeStart < 0x10000);
  1890. Assert(pendingRangeEnd == INVALID_CODEPOINT);
  1891. if (pendingRangeStart > lastCodepoint)
  1892. {
  1893. Fail(JSERR_RegExpBadRange);
  1894. }
  1895. pendingRangeStart = lastCodepoint = INVALID_CODEPOINT;
  1896. }
  1897. }
  1898. // We have a candidate for range end, and we have a range start.
  1899. else if (pendingRangeStart != INVALID_CODEPOINT && (lastCodepoint != INVALID_CODEPOINT || pendingRangeEnd != INVALID_CODEPOINT))
  1900. {
  1901. // The following will be true at the end of each surrogate pair parse.
  1902. // Note, the escape sequence \u{} is a two time parse, so this will be true on the second time around.
  1903. if (this->next == this->positionAfterLastSurrogate)
  1904. {
  1905. lastCodepoint = this->valueOfLastSurrogate;
  1906. Assert(!previousSurrogatePart);
  1907. pendingRangeEnd = lastCodepoint;
  1908. lastCodepoint = INVALID_CODEPOINT;
  1909. }
  1910. // If the next character is the end of range ']', then we can't have a surrogate pair.
  1911. // The current character is the range end, if we don't already have a candidate.
  1912. else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
  1913. {
  1914. pendingRangeEnd = lastCodepoint;
  1915. }
  1916. //If we get here, and pendingRangeEnd is set. Then one of the above has caused it to be set, or the previous iteration of the loop.
  1917. if (pendingRangeEnd != INVALID_CODEPOINT)
  1918. {
  1919. char16 leftSingleChar, rightSingleChar, ignore;
  1920. if (pendingRangeStart >= 0x10000)
  1921. {
  1922. Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeStart, &ignore, &leftSingleChar);
  1923. }
  1924. else
  1925. {
  1926. leftSingleChar = (char16)pendingRangeStart;
  1927. }
  1928. if (pendingRangeEnd >= 0x10000)
  1929. {
  1930. Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeEnd, &rightSingleChar, &ignore);
  1931. }
  1932. else
  1933. {
  1934. rightSingleChar = (char16)pendingRangeEnd;
  1935. }
  1936. // Here it is a bit tricky, we don't know if we have a unicode option specified.
  1937. // If it is, then \ud800\udc00 - \ud800\udc01 is valid, otherwise invalid.
  1938. if (pendingRangeStart < 0x10000 && pendingRangeEnd < 0x10000 && pendingRangeStart > pendingRangeEnd)
  1939. {
  1940. Fail(JSERR_RegExpBadRange);
  1941. }
  1942. else
  1943. {
  1944. if(leftSingleChar > rightSingleChar)
  1945. {
  1946. DeferredFailIfNotUnicode(JSERR_RegExpBadRange);
  1947. }
  1948. if (pendingRangeStart > pendingRangeEnd)
  1949. {
  1950. DeferredFailIfUnicode(JSERR_RegExpBadRange);
  1951. }
  1952. }
  1953. pendingRangeStart = pendingRangeEnd = INVALID_CODEPOINT;
  1954. }
  1955. // The current char < 0x10000 is a candidate for the range end, but we need to iterate one more time.
  1956. else
  1957. {
  1958. pendingRangeEnd = lastCodepoint;
  1959. }
  1960. }
  1961. nextChar = ECLookahead();
  1962. }
  1963. // We should never have a pendingRangeEnd set when we exit the loop
  1964. Assert(pendingRangeEnd == INVALID_CODEPOINT);
  1965. }
  1966. template <typename P, const bool IsLiteral>
  1967. template <bool containsSurrogates>
  1968. Node* Parser<P, IsLiteral>::CharacterClassPass1()
  1969. {
  1970. Assert(containsSurrogates ? unicodeFlagPresent : true);
  1971. CharSet<codepoint_t> codePointSet;
  1972. MatchSetNode deferredSetNode(false, false);
  1973. MatchCharNode deferredCharNode(0);
  1974. bool isNegation = false;
  1975. if (ECLookahead() == '^')
  1976. {
  1977. isNegation = true;
  1978. ECConsume();
  1979. }
  1980. // We aren't expecting any terminating null characters, only embedded ones that should treated as valid characters.
  1981. // CharacterClassPass0 should have taken care of terminating null.
  1982. codepoint_t pendingCodePoint = INVALID_CODEPOINT;
  1983. codepoint_t pendingRangeStart = INVALID_CODEPOINT;
  1984. EncodedChar nextChar = ECLookahead();
  1985. bool previousWasASurrogate = false;
  1986. bool currIsACharSet = false;
  1987. bool prevWasACharSetAndPartOfRange = false;
  1988. bool prevprevWasACharSetAndPartOfRange = false;
  1989. while(nextChar != ']')
  1990. {
  1991. codepoint_t codePointToSet = INVALID_CODEPOINT;
  1992. // Consume ahead of time if we have two backslashes, both cases below (previously Tracked surrogate pair, and ClassEscapePass1) assume it is.
  1993. if (nextChar == '\\')
  1994. {
  1995. ECConsume();
  1996. }
  1997. // These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
  1998. if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
  1999. {
  2000. codePointToSet = pendingCodePoint;
  2001. pendingCodePoint = this->currentSurrogatePairNode->value;
  2002. Assert(ECCanConsume(this->currentSurrogatePairNode->length));
  2003. ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
  2004. this->RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
  2005. this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
  2006. }
  2007. else if (nextChar == '\\')
  2008. {
  2009. Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
  2010. codePointToSet = pendingCodePoint;
  2011. if (returnedNode->tag == Node::MatchSet)
  2012. {
  2013. if (pendingRangeStart != INVALID_CODEPOINT)
  2014. {
  2015. if (unicodeFlagPresent)
  2016. {
  2017. //A range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
  2018. //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
  2019. Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
  2020. }
  2021. codePointSet.Set(ctAllocator, '-');
  2022. }
  2023. pendingCodePoint = INVALID_CODEPOINT;
  2024. pendingRangeStart = INVALID_CODEPOINT;
  2025. codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
  2026. currIsACharSet = true;
  2027. }
  2028. else
  2029. {
  2030. // Just a character
  2031. pendingCodePoint = deferredCharNode.cs[0];
  2032. }
  2033. }
  2034. else if (nextChar == '-')
  2035. {
  2036. if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
  2037. {
  2038. // - is just a char, or end of a range.
  2039. codePointToSet = pendingCodePoint;
  2040. pendingCodePoint = '-';
  2041. ECConsume();
  2042. }
  2043. else
  2044. {
  2045. pendingRangeStart = pendingCodePoint;
  2046. ECConsume();
  2047. }
  2048. }
  2049. else
  2050. {
  2051. // Just a character, consume it
  2052. codePointToSet = pendingCodePoint;
  2053. pendingCodePoint = NextChar();
  2054. }
  2055. if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
  2056. {
  2057. if (prevprevWasACharSetAndPartOfRange)
  2058. {
  2059. //A range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
  2060. //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
  2061. if (unicodeFlagPresent)
  2062. {
  2063. Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
  2064. }
  2065. if (pendingCodePoint != INVALID_CODEPOINT)
  2066. {
  2067. codePointSet.Set(ctAllocator, pendingCodePoint);
  2068. }
  2069. codePointSet.Set(ctAllocator, '-'); //Add '-' to set because a range was detected but turned out to be a union of character set with '-' and another atom.
  2070. pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
  2071. }
  2072. else if (pendingRangeStart != INVALID_CODEPOINT)
  2073. {
  2074. if (pendingRangeStart > pendingCodePoint)
  2075. {
  2076. //We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
  2077. //This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
  2078. Assert(!unicodeFlagPresent);
  2079. Fail(JSERR_RegExpBadRange);
  2080. }
  2081. codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
  2082. pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
  2083. }
  2084. else
  2085. {
  2086. codePointSet.Set(ctAllocator, codePointToSet);
  2087. }
  2088. }
  2089. nextChar = ECLookahead();
  2090. prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
  2091. prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
  2092. currIsACharSet = false;
  2093. }
  2094. if (pendingCodePoint != INVALID_CODEPOINT)
  2095. {
  2096. codePointSet.Set(ctAllocator, pendingCodePoint);
  2097. }
  2098. // At this point, we have a complete set of codepoints representing the range.
  2099. // Before performing translation of any kind, we need to do some case filling.
  2100. // At the point of this comment, there are no case mappings going cross-plane between simple
  2101. // characters (< 0x10000) and supplementary characters (>= 0x10000)
  2102. // However, it might still be the case, and this has to be handled.
  2103. // On the other hand, we don't want to prevent optimizations that expect non-casefolded sets from happening.
  2104. // At least for simple characters.
  2105. // The simple case, is when the unicode flag isn't specified, we can go ahead and return the simple set.
  2106. // Negations and case mappings will be handled later.
  2107. if (!unicodeFlagPresent)
  2108. {
  2109. Assert(codePointSet.SimpleCharCount() == codePointSet.Count());
  2110. MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
  2111. codePointSet.CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
  2112. return simpleToReturn;
  2113. }
  2114. // Everything past here must be under the flag
  2115. Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2116. if (codePointSet.IsEmpty())
  2117. {
  2118. return Anew(ctAllocator, MatchSetNode, false, false);
  2119. }
  2120. Node* prefixNode = nullptr;
  2121. Node* suffixNode = nullptr;
  2122. CharSet<codepoint_t> *toUseForTranslation = &codePointSet;
  2123. // If a singleton, return a simple character
  2124. bool isSingleton = !this->caseInsensitiveFlagPresent && !isNegation && codePointSet.IsSingleton();
  2125. if (isSingleton)
  2126. {
  2127. codepoint_t singleton = codePointSet.Singleton();
  2128. Node* toReturn = nullptr;
  2129. if (singleton < 0x10000)
  2130. {
  2131. toReturn = Anew(ctAllocator, MatchCharNode, (char16)singleton);
  2132. }
  2133. else
  2134. {
  2135. Assert(unicodeFlagPresent);
  2136. char16 lowerSurrogate, upperSurrogate;
  2137. Js::NumberUtilities::CodePointAsSurrogatePair(singleton, &lowerSurrogate, &upperSurrogate);
  2138. toReturn = CreateSurrogatePairAtom(lowerSurrogate, upperSurrogate);
  2139. }
  2140. codePointSet.Clear(ctAllocator);
  2141. return toReturn;
  2142. }
  2143. // If negation, we want to complement the simple chars.
  2144. // When a set is negated, optimizations skip checking if applicable, so we can go ahead and negate it here.
  2145. CharSet<codepoint_t> negatedSet;
  2146. if (!this->caseInsensitiveFlagPresent)
  2147. {
  2148. if (isNegation)
  2149. {
  2150. // Complement all characters, and use it as the set toTranslate
  2151. codePointSet.ToComplement(ctAllocator, negatedSet);
  2152. }
  2153. toUseForTranslation = isNegation ? &negatedSet : &codePointSet;
  2154. if (isNegation)
  2155. {
  2156. // Clear this, as we will no longer need this.
  2157. codePointSet.FreeBody(ctAllocator);
  2158. }
  2159. }
  2160. else
  2161. {
  2162. CharSet<codepoint_t> caseEquivalent;
  2163. codePointSet.ToEquivClass(ctAllocator, caseEquivalent);
  2164. // Equiv set can't have a reduced count of chars
  2165. Assert(caseEquivalent.Count() >= codePointSet.Count());
  2166. // Here we have a regex that has both case insensitive and unicode options.
  2167. // The range might also be negated. If it is negated, we can go ahead and negate
  2168. // the entire set as well as fill in cases, as optimizations wouldn't kick in anyways.
  2169. if (isNegation)
  2170. {
  2171. codePointSet.Clear(ctAllocator);
  2172. caseEquivalent.ToComplement(ctAllocator, codePointSet);
  2173. caseEquivalent.FreeBody(ctAllocator);
  2174. }
  2175. else
  2176. {
  2177. codePointSet.CloneFrom(ctAllocator, caseEquivalent);
  2178. }
  2179. Assert(toUseForTranslation == &codePointSet);
  2180. }
  2181. uint totalCodePointsCount = toUseForTranslation->Count();
  2182. uint simpleCharsCount = toUseForTranslation->SimpleCharCount();
  2183. if (totalCodePointsCount == simpleCharsCount)
  2184. {
  2185. MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
  2186. toUseForTranslation->CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
  2187. return simpleToReturn;
  2188. }
  2189. if (simpleCharsCount > 0)
  2190. {
  2191. if (!toUseForTranslation->ContainSurrogateCodeUnits())
  2192. {
  2193. MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
  2194. toUseForTranslation->CloneSimpleCharsTo(ctAllocator, node->set);
  2195. prefixNode = node;
  2196. }
  2197. else
  2198. {
  2199. MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
  2200. toUseForTranslation->CloneNonSurrogateCodeUnitsTo(ctAllocator, node->set);
  2201. prefixNode = node;
  2202. node = Anew(ctAllocator, MatchSetNode, false, false);
  2203. toUseForTranslation->CloneSurrogateCodeUnitsTo(ctAllocator, node->set);
  2204. suffixNode = node;
  2205. }
  2206. }
  2207. Assert(unicodeFlagPresent);
  2208. AltNode *headToReturn = prefixNode == nullptr ? nullptr : Anew(ctAllocator, AltNode, prefixNode, nullptr);
  2209. AltNode *currentTail = headToReturn;
  2210. codepoint_t charRangeSearchIndex = 0x10000, lowerCharOfRange = 0, upperCharOfRange = 0;
  2211. while (toUseForTranslation->GetNextRange(charRangeSearchIndex, &lowerCharOfRange, &upperCharOfRange))
  2212. {
  2213. if (lowerCharOfRange == upperCharOfRange)
  2214. {
  2215. currentTail = this->AppendSurrogatePairToDisjunction(lowerCharOfRange, currentTail);
  2216. }
  2217. else
  2218. {
  2219. currentTail = this->AppendSurrogateRangeToDisjunction(lowerCharOfRange, upperCharOfRange, currentTail);
  2220. }
  2221. if (headToReturn == nullptr)
  2222. {
  2223. headToReturn = currentTail;
  2224. }
  2225. AnalysisAssert(currentTail != nullptr);
  2226. while (currentTail->tail != nullptr)
  2227. {
  2228. currentTail = currentTail->tail;
  2229. }
  2230. charRangeSearchIndex = upperCharOfRange + 1;
  2231. }
  2232. if (suffixNode != nullptr)
  2233. {
  2234. currentTail->tail = Anew(ctAllocator, AltNode, suffixNode, nullptr);
  2235. }
  2236. toUseForTranslation->Clear(ctAllocator);
  2237. if (headToReturn != nullptr && headToReturn->tail == nullptr)
  2238. {
  2239. return headToReturn->head;
  2240. }
  2241. return headToReturn;
  2242. }
  2243. #pragma warning(push)
  2244. #pragma warning(disable:4702) // unreachable code
  2245. template <typename P, const bool IsLiteral>
  2246. bool Parser<P, IsLiteral>::ClassEscapePass0(Char& singleton, bool& previousSurrogatePart)
  2247. {
  2248. // Could be terminating 0
  2249. EncodedChar ec = ECLookahead();
  2250. if (ec == 0 && IsEOF())
  2251. {
  2252. Fail(JSERR_RegExpSyntax);
  2253. return false;
  2254. }
  2255. else if (standardEncodedChars->IsOctal(ec))
  2256. {
  2257. uint n = 0;
  2258. CharCount digits = 0;
  2259. do
  2260. {
  2261. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  2262. if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
  2263. break;
  2264. n = m;
  2265. ECConsume();
  2266. digits++;
  2267. }
  2268. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
  2269. singleton = UTC((UChar)n);
  2270. // Clear possible pair
  2271. this->tempLocationOfSurrogatePair = nullptr;
  2272. return true;
  2273. }
  2274. else
  2275. {
  2276. const EncodedChar* location = this->tempLocationOfSurrogatePair;
  2277. // Clear it for now, otherwise to many branches to clear it on.
  2278. this->tempLocationOfSurrogatePair = nullptr;
  2279. // An escaped '/' is ok
  2280. Char c = NextChar();
  2281. switch (c)
  2282. {
  2283. case 'b':
  2284. singleton = '\b';
  2285. return true;
  2286. case 'f':
  2287. singleton = '\f';
  2288. return true;
  2289. case 'n':
  2290. singleton = '\n';
  2291. return true;
  2292. case 'r':
  2293. singleton = '\r';
  2294. return true;
  2295. case 't':
  2296. singleton = '\t';
  2297. return true;
  2298. case 'v':
  2299. singleton = '\v';
  2300. return true;
  2301. case 'd':
  2302. case 'D':
  2303. case 's':
  2304. case 'S':
  2305. case 'w':
  2306. case 'W':
  2307. return false;
  2308. case 'c':
  2309. if (!standardEncodedChars->IsLetter(ECLookahead())) //Letter set [A-Z, a-z]
  2310. {
  2311. // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar spec #prod-annexB-Term
  2312. DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
  2313. }
  2314. if (standardEncodedChars->IsWord(ECLookahead())) // word set [A-Z,a-z,0-9,_], terminating 0 is not a word character
  2315. {
  2316. singleton = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
  2317. ECConsume();
  2318. }
  2319. else
  2320. {
  2321. // If the lookahead is a non-alphanumeric and not an underscore ('_'), then treat '\' and 'c' separately.
  2322. //#sec-regular-expression-patterns-semantics
  2323. ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.
  2324. singleton = '\\';
  2325. }
  2326. return true;
  2327. case 'x':
  2328. if (ECCanConsume(2) &&
  2329. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2330. standardEncodedChars->IsHex(ECLookahead(1)))
  2331. {
  2332. singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  2333. (standardEncodedChars->DigitValue(ECLookahead(1))));
  2334. ECConsume(2);
  2335. }
  2336. else
  2337. singleton = c;
  2338. return true;
  2339. case 'u':
  2340. this->tempLocationOfSurrogatePair = location;
  2341. if (this->TryParseExtendedUnicodeEscape(singleton, previousSurrogatePart, true) > 0)
  2342. return true;
  2343. else if (ECCanConsume(4) &&
  2344. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2345. standardEncodedChars->IsHex(ECLookahead(1)) &&
  2346. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2347. standardEncodedChars->IsHex(ECLookahead(3)))
  2348. {
  2349. singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  2350. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  2351. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  2352. (standardEncodedChars->DigitValue(ECLookahead(3))));
  2353. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2354. {
  2355. // Current location
  2356. TrackIfSurrogatePair(singleton, (next - 1), 5);
  2357. }
  2358. // The above if statement, if true, will clear tempLocationOfSurrogatePair if needs to.
  2359. ECConsume(4);
  2360. }
  2361. else
  2362. singleton = c;
  2363. return true;
  2364. default:
  2365. // embedded 0 is ok
  2366. singleton = c;
  2367. return true;
  2368. }
  2369. }
  2370. }
  2371. #pragma warning(pop)
  2372. template <typename P, const bool IsLiteral>
  2373. Node* Parser<P, IsLiteral>::ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart)
  2374. {
  2375. // Checked for terminating 0 is pass 0
  2376. Assert(!IsEOF());
  2377. if (standardEncodedChars->IsOctal(ECLookahead()))
  2378. {
  2379. // As per Annex B, allow octal escapes instead of just \0 (and \8 and \9 are identity escapes).
  2380. // Must be between 1 and 3 octal digits.
  2381. uint n = 0;
  2382. CharCount digits = 0;
  2383. do
  2384. {
  2385. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  2386. if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
  2387. break;
  2388. n = m;
  2389. ECConsume();
  2390. digits++;
  2391. }
  2392. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
  2393. deferredCharNode->cs[0] = UTC((UChar)n);
  2394. return deferredCharNode;
  2395. }
  2396. else
  2397. {
  2398. Char c = NextChar();
  2399. switch (c)
  2400. {
  2401. case 'b':
  2402. c = '\b';
  2403. break; // fall-through for identity escape
  2404. case 'f':
  2405. c = '\f';
  2406. break; // fall-through for identity escape
  2407. case 'n':
  2408. c = '\n';
  2409. break; // fall-through for identity escape
  2410. case 'r':
  2411. c = '\r';
  2412. break; // fall-through for identity escape
  2413. case 't':
  2414. c = '\t';
  2415. break; // fall-through for identity escape
  2416. case 'v':
  2417. c = '\v';
  2418. break; // fall-through for identity escape
  2419. case 'd':
  2420. standardChars->SetDigits(ctAllocator, deferredSetNode->set);
  2421. return deferredSetNode;
  2422. case 'D':
  2423. standardChars->SetNonDigits(ctAllocator, deferredSetNode->set);
  2424. return deferredSetNode;
  2425. case 's':
  2426. standardChars->SetWhitespace(ctAllocator, deferredSetNode->set);
  2427. return deferredSetNode;
  2428. case 'S':
  2429. standardChars->SetNonWhitespace(ctAllocator, deferredSetNode->set);
  2430. return deferredSetNode;
  2431. case 'w':
  2432. standardChars->SetWordChars(ctAllocator, deferredSetNode->set);
  2433. return deferredSetNode;
  2434. case 'W':
  2435. standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
  2436. return deferredSetNode;
  2437. case 'c':
  2438. if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word character
  2439. {
  2440. c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
  2441. ECConsume();
  2442. // fall-through for identity escape
  2443. }
  2444. else
  2445. {
  2446. // If the lookahead is a non-alphanumeric and not an underscore ('_'), then treat '\' and 'c' separately.
  2447. //#sec-regular-expression-patterns-semantics
  2448. ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.
  2449. deferredCharNode->cs[0] = '\\';
  2450. return deferredCharNode;
  2451. }
  2452. break;
  2453. case 'x':
  2454. if (ECCanConsume(2) &&
  2455. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2456. standardEncodedChars->IsHex(ECLookahead(1)))
  2457. {
  2458. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  2459. (standardEncodedChars->DigitValue(ECLookahead(1))));
  2460. ECConsume(2);
  2461. // fall-through for identity escape
  2462. }
  2463. // Take to be identity escape if ill-formed as per Annex B
  2464. break;
  2465. case 'u':
  2466. if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
  2467. break;
  2468. else if (ECCanConsume(4) &&
  2469. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2470. standardEncodedChars->IsHex(ECLookahead(1)) &&
  2471. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2472. standardEncodedChars->IsHex(ECLookahead(3)))
  2473. {
  2474. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  2475. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  2476. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  2477. (standardEncodedChars->DigitValue(ECLookahead(3))));
  2478. ECConsume(4);
  2479. // fall-through for identity escape
  2480. }
  2481. // Take to be identity escape if ill-formed as per Annex B.
  2482. break;
  2483. default:
  2484. // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok.
  2485. break;
  2486. }
  2487. // Must be an identity escape
  2488. deferredCharNode->cs[0] = c;
  2489. return deferredCharNode;
  2490. }
  2491. }
  2492. //
  2493. // Options
  2494. //
  2495. template <typename P, const bool IsLiteral>
  2496. void Parser<P, IsLiteral>::Options(RegexFlags& flags)
  2497. {
  2498. while (true)
  2499. {
  2500. // Could be terminating 0
  2501. EncodedChar ec = ECLookahead();
  2502. CharCount consume;
  2503. Char c;
  2504. if (ec == 0)
  2505. // Embedded 0 not valid
  2506. return;
  2507. else if (IsLiteral &&
  2508. ec == '\\' &&
  2509. ECCanConsume(6) &&
  2510. ECLookahead(1) == 'u' &&
  2511. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2512. standardEncodedChars->IsHex(ECLookahead(3)) &&
  2513. standardEncodedChars->IsHex(ECLookahead(4)) &&
  2514. standardEncodedChars->IsHex(ECLookahead(5)))
  2515. {
  2516. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2517. {
  2518. Fail(JSERR_RegExpSyntax);
  2519. return;
  2520. }
  2521. else
  2522. {
  2523. uint32 n = (standardEncodedChars->DigitValue(ECLookahead(2)) << 12) |
  2524. (standardEncodedChars->DigitValue(ECLookahead(3)) << 8) |
  2525. (standardEncodedChars->DigitValue(ECLookahead(4)) << 4) |
  2526. (standardEncodedChars->DigitValue(ECLookahead(5)));
  2527. c = UTC(n);
  2528. consume = 6;
  2529. }
  2530. }
  2531. else
  2532. {
  2533. c = Chars<EncodedChar>::CTW(ec);
  2534. consume = 1;
  2535. }
  2536. switch (c) {
  2537. case 'i':
  2538. if ((flags & IgnoreCaseRegexFlag) != 0)
  2539. {
  2540. Fail(JSERR_RegExpSyntax);
  2541. }
  2542. flags = (RegexFlags)(flags | IgnoreCaseRegexFlag);
  2543. break;
  2544. case 'g':
  2545. if ((flags & GlobalRegexFlag) != 0)
  2546. {
  2547. Fail(JSERR_RegExpSyntax);
  2548. }
  2549. flags = (RegexFlags)(flags | GlobalRegexFlag);
  2550. break;
  2551. case 'm':
  2552. if ((flags & MultilineRegexFlag) != 0)
  2553. {
  2554. Fail(JSERR_RegExpSyntax);
  2555. }
  2556. flags = (RegexFlags)(flags | MultilineRegexFlag);
  2557. break;
  2558. case 'u':
  2559. // If we don't have unicode enabled, fall through to default
  2560. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2561. {
  2562. if ((flags & UnicodeRegexFlag) != 0)
  2563. {
  2564. Fail(JSERR_RegExpSyntax);
  2565. }
  2566. flags = (RegexFlags)(flags | UnicodeRegexFlag);
  2567. // For telemetry
  2568. CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(ES6, UnicodeRegexFlag, scriptContext);
  2569. break;
  2570. }
  2571. case 'y':
  2572. if (scriptContext->GetConfig()->IsES6RegExStickyEnabled())
  2573. {
  2574. if ((flags & StickyRegexFlag) != 0)
  2575. {
  2576. Fail(JSERR_RegExpSyntax);
  2577. }
  2578. flags = (RegexFlags)(flags | StickyRegexFlag);
  2579. // For telemetry
  2580. CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(ES6, StickyRegexFlag, scriptContext);
  2581. break;
  2582. }
  2583. default:
  2584. if (standardChars->IsWord(c))
  2585. {
  2586. // Outer context could never parse this character. Signal the syntax error as
  2587. // being part of the regex.
  2588. Fail(JSERR_RegExpSyntax);
  2589. }
  2590. return;
  2591. }
  2592. ECConsume(consume);
  2593. }
  2594. }
  2595. //
  2596. // Entry points
  2597. //
  2598. template <typename P, const bool IsLiteral>
  2599. Node* Parser<P, IsLiteral>::ParseDynamic
  2600. ( const EncodedChar* body
  2601. , const EncodedChar* bodyLim
  2602. , const EncodedChar* opts
  2603. , const EncodedChar* optsLim
  2604. , RegexFlags& flags )
  2605. {
  2606. Assert(!IsLiteral);
  2607. Assert(body != 0);
  2608. Assert(bodyLim >= body && *bodyLim == 0);
  2609. Assert(opts == 0 || (optsLim >= opts && *optsLim == 0));
  2610. // Body, pass 0
  2611. SetPosition(body, bodyLim, true);
  2612. PatternPass0();
  2613. if (!IsEOF())
  2614. Fail(JSERR_RegExpSyntax);
  2615. // Options
  2616. if (opts != 0)
  2617. {
  2618. SetPosition(opts, optsLim, false);
  2619. Options(flags);
  2620. if (!IsEOF())
  2621. Fail(JSERR_RegExpSyntax);
  2622. this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2623. this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2624. Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2625. }
  2626. else
  2627. {
  2628. this->unicodeFlagPresent = false;
  2629. this->caseInsensitiveFlagPresent = false;
  2630. }
  2631. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2632. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2633. {
  2634. throw ParseError(*deferredIfNotUnicodeError);
  2635. }
  2636. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2637. {
  2638. throw ParseError(*deferredIfUnicodeError);
  2639. }
  2640. this->currentSurrogatePairNode = this->surrogatePairList;
  2641. // Body, pass 1
  2642. SetPosition(body, bodyLim, true);
  2643. Node* root = PatternPass1();
  2644. Assert(IsEOF());
  2645. return root;
  2646. }
  2647. template <typename P, const bool IsLiteral>
  2648. Node* Parser<P, IsLiteral>::ParseLiteral
  2649. ( const EncodedChar* input
  2650. , const EncodedChar* inputLim
  2651. , CharCount& outBodyEncodedChars
  2652. , CharCount& outTotalEncodedChars
  2653. , CharCount& outBodyChars
  2654. , CharCount& outTotalChars
  2655. , RegexFlags& flags )
  2656. {
  2657. Assert(IsLiteral);
  2658. Assert(input != 0);
  2659. Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
  2660. // To handle surrogate pairs properly under unicode option, we will collect information on location of the pairs
  2661. // during pass 0, regardless if the option is present. (We aren't able to get it at that time)
  2662. // During pass 1, we will use that information to correctly create appropriate nodes.
  2663. // Body, pass 0
  2664. SetPosition(input, inputLim, true);
  2665. PatternPass0();
  2666. outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2667. outBodyChars = Pos();
  2668. // Options are needed for the next pass
  2669. ECMust('/', ERRnoSlash);
  2670. Options(flags);
  2671. this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2672. this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2673. Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2674. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2675. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2676. {
  2677. throw ParseError(*deferredIfNotUnicodeError);
  2678. }
  2679. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2680. {
  2681. throw ParseError(*deferredIfUnicodeError);
  2682. }
  2683. // Used below to proceed to the end of the regex
  2684. const EncodedChar *pastOptions = next;
  2685. this->currentSurrogatePairNode = this->surrogatePairList;
  2686. // Body, pass 1
  2687. SetPosition(input, inputLim, true);
  2688. Node* root = PatternPass1();
  2689. Assert(outBodyEncodedChars == Chars<EncodedChar>::OSB(next, input));
  2690. Assert(outBodyChars == Pos());
  2691. next = pastOptions;
  2692. outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2693. outTotalChars = Pos();
  2694. return root;
  2695. }
  2696. template <typename P, const bool IsLiteral>
  2697. void Parser<P, IsLiteral>::ParseLiteralNoAST
  2698. ( const EncodedChar* input
  2699. , const EncodedChar* inputLim
  2700. , CharCount& outBodyEncodedChars
  2701. , CharCount& outTotalEncodedChars
  2702. , CharCount& outBodyChars
  2703. , CharCount& outTotalChars )
  2704. {
  2705. Assert(IsLiteral);
  2706. Assert(input != 0);
  2707. Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
  2708. // Body, pass 0
  2709. SetPosition(input, inputLim, true);
  2710. PatternPass0();
  2711. outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2712. outBodyChars = Pos();
  2713. // Options
  2714. ECMust('/', ERRnoSlash);
  2715. RegexFlags dummyFlags = NoRegexFlags;
  2716. Options(dummyFlags);
  2717. this->unicodeFlagPresent = (dummyFlags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2718. this->caseInsensitiveFlagPresent = (dummyFlags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2719. outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2720. outTotalChars = Pos();
  2721. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2722. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2723. {
  2724. throw ParseError(*deferredIfNotUnicodeError);
  2725. }
  2726. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2727. {
  2728. throw ParseError(*deferredIfUnicodeError);
  2729. }
  2730. }
  2731. template <typename P, const bool IsLiteral>
  2732. template <const bool buildAST>
  2733. RegexPattern * Parser<P, IsLiteral>::CompileProgram
  2734. ( Node* root,
  2735. const EncodedChar*& currentCharacter,
  2736. const CharCount totalLen,
  2737. const CharCount bodyChars,
  2738. const CharCount bodyEncodedChars,
  2739. const CharCount totalChars,
  2740. const RegexFlags flags )
  2741. {
  2742. Assert(IsLiteral);
  2743. Program* program = nullptr;
  2744. if (buildAST)
  2745. {
  2746. const auto recycler = this->scriptContext->GetRecycler();
  2747. program = Program::New(recycler, flags);
  2748. this->CaptureSourceAndGroups(recycler, program, currentCharacter, bodyChars, bodyEncodedChars);
  2749. }
  2750. currentCharacter += totalLen;
  2751. Assert(GetMultiUnits() == totalLen - totalChars);
  2752. if (!buildAST)
  2753. {
  2754. return nullptr;
  2755. }
  2756. RegexPattern* pattern = RegexPattern::New(this->scriptContext, program, true);
  2757. #if ENABLE_REGEX_CONFIG_OPTIONS
  2758. RegexStats* stats = 0;
  2759. if (REGEX_CONFIG_FLAG(RegexProfile))
  2760. {
  2761. stats = this->scriptContext->GetRegexStatsDatabase()->GetRegexStats(pattern);
  2762. this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Parse);
  2763. }
  2764. if (REGEX_CONFIG_FLAG(RegexTracing))
  2765. {
  2766. DebugWriter* tw = this->scriptContext->GetRegexDebugWriter();
  2767. tw->Print(_u("// REGEX COMPILE "));
  2768. pattern->Print(tw);
  2769. tw->EOL();
  2770. }
  2771. if (REGEX_CONFIG_FLAG(RegexProfile))
  2772. this->scriptContext->GetRegexStatsDatabase()->BeginProfile();
  2773. #endif
  2774. ArenaAllocator* rtAllocator = this->scriptContext->RegexAllocator();
  2775. Compiler::Compile
  2776. ( this->scriptContext
  2777. , ctAllocator
  2778. , rtAllocator
  2779. , standardChars
  2780. , program
  2781. , root
  2782. , this->GetLitbuf()
  2783. , pattern
  2784. #if ENABLE_REGEX_CONFIG_OPTIONS
  2785. , w
  2786. , stats
  2787. #endif
  2788. );
  2789. #if ENABLE_REGEX_CONFIG_OPTIONS
  2790. if (REGEX_CONFIG_FLAG(RegexProfile))
  2791. this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Compile);
  2792. #endif
  2793. #ifdef PROFILE_EXEC
  2794. this->scriptContext->ProfileEnd(Js::RegexCompilePhase);
  2795. #endif
  2796. // CaptureSourceAndGroups throws if this condition doesn't hold.
  2797. Assert(0 < pattern->NumGroups() && pattern->NumGroups() <= MAX_NUM_GROUPS);
  2798. return pattern;
  2799. }
  2800. template <typename P, const bool IsLiteral>
  2801. void Parser<P, IsLiteral>::CaptureEmptySourceAndNoGroups(Program* program)
  2802. {
  2803. Assert(program->source == 0);
  2804. program->source = const_cast<Char*>(_u(""));
  2805. program->sourceLen = 0;
  2806. program->numGroups = 1;
  2807. // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
  2808. }
  2809. template <typename P, const bool IsLiteral>
  2810. void Parser<P, IsLiteral>::CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars)
  2811. {
  2812. Assert(program->source == 0);
  2813. Assert(body != 0);
  2814. // Program will own source string
  2815. program->source = RecyclerNewArrayLeaf(recycler, Char, bodyChars + 1);
  2816. // Don't need to zero out since we're writing to the buffer right here
  2817. this->ConvertToUnicode(program->source, bodyChars, body, body + bodyEncodedChars);
  2818. program->source[bodyChars] = 0;
  2819. program->sourceLen = bodyChars;
  2820. // We expect nextGroupId to be positive, because the full regexp itself always
  2821. // counts as a capturing group.
  2822. Assert(nextGroupId > 0);
  2823. if (nextGroupId > MAX_NUM_GROUPS)
  2824. {
  2825. Js::JavascriptError::ThrowRangeError(this->scriptContext, JSERR_RegExpTooManyCapturingGroups);
  2826. }
  2827. else
  2828. {
  2829. program->numGroups = static_cast<uint16>(nextGroupId);
  2830. }
  2831. // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
  2832. }
  2833. template <typename P, const bool IsLiteral>
  2834. Node* Parser<P, IsLiteral>::GetNodeWithValidCharacterSet(EncodedChar cc)
  2835. {
  2836. Node* nodeToReturn = nullptr;
  2837. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && this->unicodeFlagPresent)
  2838. {
  2839. MatchSetNode *lowerRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
  2840. lowerRangeNode->set.SetRange(ctAllocator, (Char)0xD800, (Char)0xDBFF);
  2841. MatchSetNode *upperRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
  2842. upperRangeNode->set.SetRange(ctAllocator, (Char)0xDC00, (Char)0xDFFF);
  2843. ConcatNode* surrogateRangePairNode = Anew(ctAllocator, ConcatNode, lowerRangeNode, Anew(ctAllocator, ConcatNode, upperRangeNode, nullptr));
  2844. // The MatchSet node will be split into [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) as a prefix, and a suffix of [D800-DBFF]
  2845. // i.e. The MatchSet node can be with [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) OR [D800-DBFF]
  2846. MatchSetNode* partialPrefixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  2847. switch (cc)
  2848. {
  2849. case '.':
  2850. standardChars->SetNonNewline(ctAllocator, partialPrefixSetNode->set);
  2851. break;
  2852. case 'S':
  2853. standardChars->SetNonWhitespace(ctAllocator, partialPrefixSetNode->set);
  2854. break;
  2855. case 'D':
  2856. standardChars->SetNonDigits(ctAllocator, partialPrefixSetNode->set);
  2857. break;
  2858. case 'W':
  2859. if (this->caseInsensitiveFlagPresent)
  2860. {
  2861. standardChars->SetNonWordIUChars(ctAllocator, partialPrefixSetNode->set);
  2862. }
  2863. else
  2864. {
  2865. standardChars->SetNonWordChars(ctAllocator, partialPrefixSetNode->set);
  2866. }
  2867. break;
  2868. default:
  2869. AssertMsg(false, "");
  2870. }
  2871. partialPrefixSetNode->set.SubtractRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
  2872. MatchSetNode* partialSuffixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  2873. partialSuffixSetNode->set.SetRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
  2874. AltNode* altNode = Anew(ctAllocator, AltNode, partialPrefixSetNode, Anew(ctAllocator, AltNode, surrogateRangePairNode, Anew(ctAllocator, AltNode, partialSuffixSetNode, nullptr)));
  2875. nodeToReturn = altNode;
  2876. }
  2877. else
  2878. {
  2879. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  2880. switch (cc)
  2881. {
  2882. case '.':
  2883. standardChars->SetNonNewline(ctAllocator, setNode->set);
  2884. break;
  2885. case 'S':
  2886. standardChars->SetNonWhitespace(ctAllocator, setNode->set);
  2887. break;
  2888. case 'D':
  2889. standardChars->SetNonDigits(ctAllocator, setNode->set);
  2890. break;
  2891. case 'W':
  2892. standardChars->SetNonWordChars(ctAllocator, setNode->set);
  2893. break;
  2894. default:
  2895. AssertMsg(false, "");
  2896. }
  2897. nodeToReturn = setNode;
  2898. }
  2899. return nodeToReturn;
  2900. }
  2901. template <typename P, const bool IsLiteral>
  2902. void Parser<P, IsLiteral>::FreeBody()
  2903. {
  2904. if (litbuf != 0)
  2905. {
  2906. ctAllocator->Free(litbuf, litbufLen);
  2907. litbuf = nullptr;
  2908. litbufLen = 0;
  2909. litbufNext = 0;
  2910. }
  2911. }
  2912. // Instantiate all templates
  2913. #define INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, IsLiteral, BuildAST) \
  2914. template RegexPattern* Parser<EncodingPolicy, IsLiteral>::CompileProgram<BuildAST>(Node* root, const EncodedChar*& currentCharacter, const CharCount totalLen, const CharCount bodyChars, const CharCount bodyEncodedChars, const CharCount totalChars, const RegexFlags flags );
  2915. #define INSTANTIATE_REGEX_PARSER(EncodingPolicy) \
  2916. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, false, false) \
  2917. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, false, true) \
  2918. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, true, false) \
  2919. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, true, true) \
  2920. template class Parser<EncodingPolicy, false>; \
  2921. template class Parser<EncodingPolicy, true>;
  2922. // Instantiate the Parser
  2923. INSTANTIATE_REGEX_PARSER(NullTerminatedUnicodeEncodingPolicy);
  2924. INSTANTIATE_REGEX_PARSER(NotNullTerminatedUTF8EncodingPolicy);
  2925. }