RegexParser.cpp 128 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193319431953196319731983199320032013202320332043205320632073208320932103211321232133214321532163217
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. /*
  6. --------------------------------------------------------------------------------------------------------------------------------
  7. Original
  8. --------------------------------------------------------------------------------------------------------------------------------
  9. Pattern ::= Disjunction
  10. Disjunction ::= Alternative | Alternative '|' Disjunction
  11. Alternative ::= [empty] | Alternative Term
  12. Term ::= Assertion | Atom | Atom Quantifier
  13. Assertion ::= '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  14. Quantifier ::= QuantifierPrefix | QuantifierPrefix '?'
  15. QuantifierPrefix ::= '*' | '+' | '?' | '{' DecimalDigits '}' | '{' DecimalDigits ',' '}' | '{' DecimalDigits ',' DecimalDigits '}'
  16. Atom ::= PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')'
  17. PatternCharacter ::= SourceCharacter but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
  18. AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
  19. CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
  20. ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
  21. ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
  22. IdentityEscape ::= (SourceCharacter but not IdentifierPart) | <ZWJ> | <ZWNJ>
  23. DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
  24. CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
  25. CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
  26. ClassRanges ::= [empty] | NonemptyClassRanges
  27. NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
  28. NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
  29. ClassAtom ::= '-' | ClassAtomNoDash
  30. ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
  31. ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
  32. SourceCharacter ::= <unicode character>
  33. HexEscapeSequence ::= 'x' HexDigit HexDigit
  34. UnicodeEscapeSequence ::= 'u' HexDigit HexDigit HexDigit HexDigit | 'u' '{' HexDigits '}'
  35. HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
  36. IdentifierStart ::= UnicodeLetter | '$' | '_' | '\' UnicodeEscapeSequence
  37. IdentifierPart ::= IdentifierStart | UnicodeCombiningMark | UnicodeDigit | UnicodeConnectorPunctuation | <ZWNJ> | <ZWJ>
  38. UnicodeLetter ::= <unicode Uppercase letter> | <unicode Lowercase letter> | <unicode Titlecase letter> | <unicode Modifier letter> | <unicode Other letter> | <unicode Letter number>
  39. UnicodeCombiningMark = <unicode Non-spacing mark> | <unicode combining spacing mark>
  40. UnicodeDigit ::= <unicode Decimal number>
  41. UnicodeConnectorPunctuation ::= <unicode Connector punctuation>
  42. DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
  43. DecimalDigits ::= DecimalDigit | DecimalDigits DecimalDigit
  44. DecimalDigit ::= one of { '0'..'9' }
  45. NonZeroDigit ::= one of { '1'..'9' }
  46. ------------------
  47. Annex B Deviations
  48. ------------------
  49. 1. The assertions (?= ) and (?! ) are treated as though they have a surrounding non-capture group, and hence can be quantified.
  50. Other assertions are not quantifiable.
  51. QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  52. Assertion [replaced] ::= '^' | '$' | '\' 'b' | '\' 'B' | QuantifiableAssertion
  53. Term ::= ... | QuantifiableAssertion Quantifier [added]
  54. --------------------------------------------------------------------------------------------------------------------------------
  55. Left factored
  56. --------------------------------------------------------------------------------------------------------------------------------
  57. Pattern ::= Disjunction
  58. Disjunction ::= Alternative ('|' Alternative)*
  59. FOLLOW(Disjunction) = { <eof>, ')' }
  60. Alternative ::= Term*
  61. FOLLOW(Alternative) = { <eof>, ')', '|' }
  62. Term ::= ( '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')' | Atom Quantifier? )
  63. | ( PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')' )
  64. ( '*' | '+' | '?' | '{' DecimalDigits (',' DecimalDigits?)? '}' ) '?'?
  65. PatternCharacter ::= <unicode character> but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
  66. AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
  67. CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
  68. ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
  69. ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
  70. IdentityEscape ::= <unicode character> but not <unicode Uppercase letter>, <unicode Lowercase letter>, <unicode Titlecase letter>, <unicode Modifier letter>, <unicode Other letter>, <unicode Letter number>, '$', '_', <unicode Non-spacing mark>, <unicode combining spacing mark>, <unicode Decimal number>, <unicode Connector punctuation>
  71. DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
  72. CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
  73. CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
  74. ClassRanges ::= [empty] | NonemptyClassRanges
  75. NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
  76. NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
  77. ClassAtom ::= '-' | ClassAtomNoDash
  78. ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
  79. ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
  80. HexEscapeSequence ::= 'x' HexDigit{2}
  81. UnicodeEscapeSequence ::= 'u' HexDigit{4} | 'u' '{' HexDigits{4-6} '}'
  82. HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
  83. DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
  84. DecimalDigits ::= DecimalDigit+
  85. DecimalDigit ::= one of { '0'..'9' }
  86. NonZeroDigit ::= one of { '1'..'9' }
  87. ------------------
  88. Annex B Deviations
  89. ------------------
  90. QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  91. Term ::= ... | '(' '?' '=' Disjunction ')' [removed] | '(' '?' '!' Disjunction ')' [removed] | QuantifiableAssertion Quantifier? [added]
  92. */
  93. #include "ParserPch.h"
  94. namespace UnifiedRegex
  95. {
  96. template <typename P, const bool IsLiteral>
  97. const CharCount Parser<P, IsLiteral>::initLitbufSize;
  98. ParseError::ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error)
  99. : isBody(isBody), pos(pos), encodedPos(encodedPos), error(error)
  100. {
  101. }
  102. template <typename P, const bool IsLiteral>
  103. Parser<P, IsLiteral>::Parser
  104. ( Js::ScriptContext* scriptContext
  105. , ArenaAllocator* ctAllocator
  106. , StandardChars<EncodedChar>* standardEncodedChars
  107. , StandardChars<Char>* standardChars
  108. , bool isUtf8
  109. #if ENABLE_REGEX_CONFIG_OPTIONS
  110. , DebugWriter* w
  111. #endif
  112. )
  113. : scriptContext(scriptContext)
  114. , ctAllocator(ctAllocator)
  115. , standardEncodedChars(standardEncodedChars)
  116. , standardChars(standardChars)
  117. #if ENABLE_REGEX_CONFIG_OPTIONS
  118. , w(w)
  119. #endif
  120. , input(0)
  121. , inputLim(0)
  122. , next(0)
  123. , inBody(false)
  124. , numGroups(1)
  125. , nextGroupId(1) // implicit overall group always takes index 0
  126. , litbuf(0)
  127. , litbufLen(0)
  128. , litbufNext(0)
  129. , surrogatePairList(nullptr)
  130. , currentSurrogatePairNode(nullptr)
  131. , tempLocationOfSurrogatePair(nullptr)
  132. , tempLocationOfRange(nullptr)
  133. , codePointAtTempLocation(0)
  134. , unicodeFlagPresent(false)
  135. , dotAllFlagPresent(false)
  136. , caseInsensitiveFlagPresent(false)
  137. , positionAfterLastSurrogate(nullptr)
  138. , valueOfLastSurrogate(INVALID_CODEPOINT)
  139. , deferredIfNotUnicodeError(nullptr)
  140. , deferredIfUnicodeError(nullptr)
  141. {
  142. this->SetIsUtf8(isUtf8);
  143. }
  144. //
  145. // Input buffer management
  146. //
  147. template <typename P, const bool IsLiteral>
  148. void Parser<P, IsLiteral>::SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody)
  149. {
  150. this->input = input;
  151. this->inputLim = inputLim;
  152. next = input;
  153. this->inBody = inBody;
  154. this->RestoreMultiUnits(0);
  155. }
  156. template <typename P, const bool IsLiteral>
  157. inline CharCount Parser<P, IsLiteral>::Pos()
  158. {
  159. CharCount nextOffset = Chars<EncodedChar>::OSB(next, input);
  160. Assert(nextOffset >= this->m_cMultiUnits);
  161. return nextOffset - (CharCount) this->m_cMultiUnits;
  162. }
  163. template <typename P, const bool IsLiteral>
  164. inline bool Parser<P, IsLiteral>::IsEOF()
  165. {
  166. return next >= inputLim;
  167. }
  168. template <typename P, const bool IsLiteral>
  169. inline bool Parser<P, IsLiteral>::ECCanConsume(CharCount n /*= 1*/)
  170. {
  171. return next + n <= inputLim;
  172. }
  173. template <typename P, const bool IsLiteral>
  174. inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookahead(CharCount n /*= 0*/)
  175. {
  176. // Ok to look ahead to terminating 0
  177. Assert(next + n <= inputLim);
  178. return next[n];
  179. }
  180. template <typename P, const bool IsLiteral>
  181. inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookback(CharCount n /*= 0*/)
  182. {
  183. // Ok to look ahead to terminating 0
  184. Assert(n + input <= next);
  185. return *(next - n);
  186. }
  187. template <typename P, const bool IsLiteral>
  188. inline void Parser<P, IsLiteral>::ECConsume(CharCount n /*= 1*/)
  189. {
  190. Assert(next + n <= inputLim);
  191. #if DBG
  192. for (CharCount i = 0; i < n; i++)
  193. Assert(!this->IsMultiUnitChar(next[i]));
  194. #endif
  195. next += n;
  196. }
  197. template <typename P, const bool IsLiteral>
  198. inline void Parser<P, IsLiteral>::ECConsumeMultiUnit(CharCount n /*= 1*/)
  199. {
  200. Assert(next + n <= inputLim);
  201. next += n;
  202. }
  203. template <typename P, const bool IsLiteral>
  204. inline void Parser<P, IsLiteral>::ECRevert(CharCount n /*= 1*/)
  205. {
  206. Assert(n + input <= next);
  207. next -= n;
  208. }
  209. //
  210. // Helpers
  211. //
  212. template <typename P, const bool IsLiteral>
  213. int Parser<P, IsLiteral>::TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair /* = false */)
  214. {
  215. if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  216. {
  217. return 0;
  218. }
  219. if (!ECCanConsume(2) || ECLookahead(0) !='{' || !standardEncodedChars->IsHex(ECLookahead(1)))
  220. {
  221. return 0;
  222. }
  223. // The first character is mandatory to consume escape sequence, so we check for it above, at this stage we can set it as we already checked.
  224. codepoint_t codePoint = standardEncodedChars->DigitValue(ECLookahead(1));
  225. int i = 2;
  226. while(ECCanConsume(i + 1) && standardEncodedChars->IsHex(ECLookahead(i)))
  227. {
  228. codePoint <<= 4;
  229. codePoint += standardEncodedChars->DigitValue(ECLookahead(i));
  230. if (codePoint > 0x10FFFF)
  231. {
  232. DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
  233. return 0;
  234. }
  235. i++;
  236. }
  237. if(!ECCanConsume(i + 1) || ECLookahead(i) != '}')
  238. {
  239. return 0;
  240. }
  241. uint consumptionNumber = i + 1;
  242. Assert(consumptionNumber >= 3);
  243. if (!previousSurrogatePart && trackSurrogatePair && this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  244. {
  245. // Current location
  246. TrackIfSurrogatePair(codePoint, (next - 1), consumptionNumber + 1);
  247. }
  248. char16 other;
  249. // Generally if this code point is a single character, then we take it and return.
  250. // If the character is made up of two characters then we emit the first and backtrack to the start of th escape sequence;
  251. // Following that we check if we have already seen the first character, and if so emit the second and consume the entire escape sequence.
  252. if (codePoint < 0x10000)
  253. {
  254. c = UTC(codePoint);
  255. ECConsumeMultiUnit(consumptionNumber);
  256. }
  257. else if (previousSurrogatePart)
  258. {
  259. previousSurrogatePart = false;
  260. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &other, &c);
  261. ECConsumeMultiUnit(consumptionNumber);
  262. }
  263. else
  264. {
  265. previousSurrogatePart = true;
  266. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &c, &other);
  267. Assert(ECLookback(1) == 'u' && ECLookback(2) == '\\');
  268. ECRevert(2);
  269. }
  270. return consumptionNumber;
  271. }
  272. // This function has the following 'knowledge':
  273. // - A codepoint that might be part of a surrogate pair or part of one
  274. // - A location where that codepoint is located
  275. // - Previously tracked part of surrogate pair (tempLocationOfSurrogatePair, and codePointAtTempLocation), which is always a surrogate lower part.
  276. // - A pointer to the current location of the linked list
  277. // - If a previous location is tracked, then it is of a parsed character (not given character) before current, and not same to current
  278. // This can't be asserted directly, and has to be followed by callers. Term pass can reset with each iteration, as well as this method in cases it needs.
  279. template <typename P, const bool IsLiteral>
  280. void Parser<P, IsLiteral>:: TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength)
  281. {
  282. Assert(codePoint < 0x110000);
  283. Assert(location != nullptr);
  284. Assert(location != this->tempLocationOfSurrogatePair);
  285. if (Js::NumberUtilities::IsSurrogateLowerPart(codePoint))
  286. {
  287. this->tempLocationOfSurrogatePair = location;
  288. this->codePointAtTempLocation = codePoint;
  289. }
  290. else
  291. {
  292. if(Js::NumberUtilities::IsSurrogateUpperPart(codePoint) && this->tempLocationOfSurrogatePair != nullptr)
  293. {
  294. Assert(Js::NumberUtilities::IsSurrogateLowerPart(codePointAtTempLocation));
  295. consumptionLength = (uint32)(location - this->tempLocationOfSurrogatePair) + consumptionLength;
  296. codePoint = Js::NumberUtilities::SurrogatePairAsCodePoint(codePointAtTempLocation, codePoint);
  297. location = this->tempLocationOfSurrogatePair;
  298. }
  299. // At this point we can clear previous location, and then if codePoint is bigger than 0xFFFF store it, as we either received it or combined it above
  300. this->tempLocationOfSurrogatePair = nullptr;
  301. this->codePointAtTempLocation = 0;
  302. }
  303. if (codePoint > 0xFFFF)
  304. {
  305. this->positionAfterLastSurrogate = location + consumptionLength;
  306. this->valueOfLastSurrogate = codePoint;
  307. // When parsing without AST we aren't given an allocator. In addition, only the 2 lines above are used during Pass 0;
  308. // while the bottom is used during Pass 1 (which isn't done when ParseNoAST)
  309. if(this->ctAllocator != nullptr)
  310. {
  311. SurrogatePairTracker* node = Anew(this->ctAllocator, SurrogatePairTracker, location, this->tempLocationOfRange, codePoint, consumptionLength, this->m_cMultiUnits);
  312. if (surrogatePairList == nullptr)
  313. {
  314. Assert(currentSurrogatePairNode == nullptr);
  315. surrogatePairList = node;
  316. currentSurrogatePairNode = node;
  317. }
  318. else
  319. {
  320. Assert(currentSurrogatePairNode != nullptr);
  321. currentSurrogatePairNode->next = node;
  322. currentSurrogatePairNode = node;
  323. }
  324. }
  325. }
  326. }
  327. template <typename P, const bool IsLiteral>
  328. Node* Parser<P, IsLiteral>::CreateSurrogatePairAtom(char16 lower, char16 upper)
  329. {
  330. MatchLiteralNode * literalNode = Anew(this->ctAllocator, MatchLiteralNode, 0, 0);
  331. MatchCharNode lowerNode(lower);
  332. MatchCharNode upperNode(upper);
  333. AccumLiteral(literalNode, &lowerNode);
  334. AccumLiteral(literalNode, &upperNode);
  335. return literalNode;
  336. }
  337. // This function will create appropriate pairs of ranges and add them to the disjunction node.
  338. // Terms used in comments:
  339. // - A minor codePoint is smaller than major codePoint, and both define a range of codePoints above 0x10000; to avoid confusion between lower/upper denoting codeUnits composing the surrogate pair.
  340. // - A boundary is a mod 0x400 alignment marker due to the nature of surrogate pairs representation. So the codepoint 0x10300 lies between boundaries 0x10000 and 0x10400.
  341. // - A prefix is the range set used to represent the values from minorCodePoint to the first boundary above minorCodePoint if applicable.
  342. // - A suffix is the range set used to represent the values from first boundary below majorCodePoint to the majorCodePoint if applicable.
  343. // - A full range is the range set used to represent the values from first boundary above minorCodePoint to first boundary below majorCodePoint if applicable.
  344. // The algorithm works as follows:
  345. // 1. Determine minorBoundary (minorCodePoint - mod 0x400 +0x400) and majorBoundary (majorCodePoint - mod 0x400). minorBoundary > minorCodePoint and majorBoundary < majorCodePoint
  346. // 2. Based on the codePoints and the boundaries, prefix, suffix, and full range is determined. Here are the rules:
  347. // 2-a. If minorBoundary > majorBoundary, we have an inner boundary range, output just that.
  348. // 2-b. If minorBoundary - 0x400u != minorCodepoint (i.e. codePoint doesn't lie right on a boundary to be part of a full range) we have a prefix.
  349. // 2-c. If majorBoundary + 0x3FFu != majorCodepoint (i.e. codePoint doesn't lie right before a boundary to be part of a full range) we have a suffix.
  350. // 2-d. We have a full range, if the two boundaries don't equal, OR the codePoints lie on the range boundaries opposite to what constitutes a prefix/suffix.
  351. // Visual representation for sample range 0x10300 - 0x10900
  352. // | [ _ | ^ ] |
  353. // 0x10000 0x10800 0x11000
  354. // [ ] - denote the actual range
  355. // _ - minorBoundary
  356. // ^ - majorBoundary
  357. // | - other boundaries
  358. // prefix is between [ and _
  359. // suffix is between ^ and ]
  360. // full range is between _ and ^
  361. template <typename P, const bool IsLiteral>
  362. AltNode* Parser<P, IsLiteral>::AppendSurrogateRangeToDisjunction(codepoint_t minorCodePoint, codepoint_t majorCodePoint, AltNode *lastAltNode)
  363. {
  364. Assert(minorCodePoint < majorCodePoint);
  365. Assert(minorCodePoint >= 0x10000u);
  366. Assert(majorCodePoint >= 0x10000u);
  367. Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && unicodeFlagPresent);
  368. char16 lowerMinorCodeUnit, upperMinorCodeUnit, lowerMajorCodeUnit, upperMajorCodeUnit;
  369. Js::NumberUtilities::CodePointAsSurrogatePair(minorCodePoint, &lowerMinorCodeUnit, &upperMinorCodeUnit);
  370. Js::NumberUtilities::CodePointAsSurrogatePair(majorCodePoint, &lowerMajorCodeUnit, &upperMajorCodeUnit);
  371. // These boundaries represent whole range boundaries, as in 0x10000, 0x10400, 0x10800 etc
  372. // minor boundary is the first boundary strictly above minorCodePoint
  373. // major boundary is the first boundary below or equal to majorCodePoint
  374. codepoint_t minorBoundary = minorCodePoint - (minorCodePoint % 0x400u) + 0x400u;
  375. codepoint_t majorBoundary = majorCodePoint - (majorCodePoint % 0x400u);
  376. Assert(minorBoundary >= 0x10000);
  377. Assert(majorBoundary >= 0x10000);
  378. AltNode* tailToAdd = nullptr;
  379. // If the minor boundary is higher than major boundary, that means we have a range within the boundary and is less than 0x400
  380. // Ex: 0x10430 - 0x10700 will have minor boundary of 0x10800 and major of 0x10400
  381. // This pair will be represented in single range set.
  382. const bool singleRange = minorBoundary > majorBoundary;
  383. if (singleRange)
  384. {
  385. Assert(majorCodePoint - minorCodePoint < 0x400u);
  386. Assert(lowerMinorCodeUnit == lowerMajorCodeUnit);
  387. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, lowerMinorCodeUnit);
  388. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  389. setNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)upperMajorCodeUnit);
  390. ConcatNode* concatNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, setNode, nullptr));
  391. tailToAdd = Anew(ctAllocator, AltNode, concatNode, nullptr);
  392. }
  393. else
  394. {
  395. Node* prefixNode = nullptr, *suffixNode = nullptr;
  396. const bool twoConsecutiveRanges = minorBoundary == majorBoundary;
  397. // For minorBoundary,
  398. if (minorBoundary - minorCodePoint == 1) // Single character in minor range
  399. {
  400. // The prefix is only a surrogate pair atom
  401. prefixNode = CreateSurrogatePairAtom(lowerMinorCodeUnit, upperMinorCodeUnit);
  402. }
  403. else if (minorCodePoint != minorBoundary - 0x400u) // Minor range isn't full
  404. {
  405. Assert(minorBoundary - minorCodePoint < 0x400u);
  406. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorCodeUnit);
  407. MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false);
  408. upperSetNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)0xDFFFu);
  409. prefixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
  410. }
  411. else // Full minor range
  412. {
  413. minorBoundary -= 0x400u;
  414. }
  415. if (majorBoundary == majorCodePoint) // Single character in major range
  416. {
  417. // The suffix is only a surrogate pair atom
  418. suffixNode = CreateSurrogatePairAtom(lowerMajorCodeUnit, upperMajorCodeUnit);
  419. majorBoundary -= 0x400u;
  420. }
  421. else if (majorBoundary + 0x3FFu != majorCodePoint) // Major range isn't full
  422. {
  423. Assert(majorCodePoint - majorBoundary < 0x3FFu);
  424. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMajorCodeUnit);
  425. MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  426. upperSetNode->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)upperMajorCodeUnit);
  427. suffixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
  428. majorBoundary -= 0x400u;
  429. }
  430. const bool nonFullConsecutiveRanges = twoConsecutiveRanges && prefixNode != nullptr && suffixNode != nullptr;
  431. if (nonFullConsecutiveRanges)
  432. {
  433. Assert(suffixNode != nullptr);
  434. Assert(minorCodePoint != minorBoundary - 0x400u);
  435. Assert(majorBoundary + 0x3FFu != majorCodePoint);
  436. // If the minor boundary is equal to major boundary, that means we have a cross boundary range that only needs 2 nodes for prefix/suffix.
  437. // We can only cross one boundary.
  438. Assert(majorCodePoint - minorCodePoint < 0x800u);
  439. tailToAdd = Anew(ctAllocator, AltNode, prefixNode, Anew(ctAllocator, AltNode, suffixNode, nullptr));
  440. }
  441. else
  442. {
  443. // We have 3 sets of ranges, comprising of prefix, full and suffix.
  444. Assert(majorCodePoint - minorCodePoint >= 0x400u);
  445. Assert((prefixNode != nullptr && suffixNode != nullptr) // Spanning more than two ranges
  446. || (prefixNode == nullptr && minorBoundary == minorCodePoint) // Two consecutive ranges and the minor is full
  447. || (suffixNode == nullptr && majorBoundary + 0x3FFu == majorCodePoint)); // Two consecutive ranges and the major is full
  448. Node* lowerOfFullRange;
  449. char16 lowerMinorBoundary, lowerMajorBoundary, ignore;
  450. Js::NumberUtilities::CodePointAsSurrogatePair(minorBoundary, &lowerMinorBoundary, &ignore);
  451. bool singleFullRange = majorBoundary == minorBoundary;
  452. if (singleFullRange)
  453. {
  454. // The lower part of the full range is simple a surrogate lower char
  455. lowerOfFullRange = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorBoundary);
  456. }
  457. else
  458. {
  459. Js::NumberUtilities::CodePointAsSurrogatePair(majorBoundary, &lowerMajorBoundary, &ignore);
  460. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  461. setNode->set.SetRange(ctAllocator, (Char)lowerMinorBoundary, (Char)lowerMajorBoundary);
  462. lowerOfFullRange = setNode;
  463. }
  464. MatchSetNode* fullUpperRange = Anew(ctAllocator, MatchSetNode, false, false);
  465. fullUpperRange->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)0xDFFFu);
  466. // These are added in the following order [full] [prefix][suffix]
  467. // This is doing by prepending, so in reverse.
  468. if (suffixNode != nullptr)
  469. {
  470. tailToAdd = Anew(ctAllocator, AltNode, suffixNode, tailToAdd);
  471. }
  472. if (prefixNode != nullptr)
  473. {
  474. tailToAdd = Anew(ctAllocator, AltNode, prefixNode, tailToAdd);
  475. }
  476. tailToAdd = Anew(ctAllocator, AltNode, Anew(ctAllocator, ConcatNode, lowerOfFullRange, Anew(ctAllocator, ConcatNode, fullUpperRange, nullptr)), tailToAdd);
  477. }
  478. }
  479. if (lastAltNode != nullptr)
  480. {
  481. Assert(lastAltNode->tail == nullptr);
  482. lastAltNode->tail = tailToAdd;
  483. }
  484. return tailToAdd;
  485. }
  486. template <typename P, const bool IsLiteral>
  487. AltNode* Parser<P, IsLiteral>::AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAltNode)
  488. {
  489. char16 lower, upper;
  490. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &upper);
  491. AltNode* tailNode = Anew(ctAllocator, AltNode, CreateSurrogatePairAtom(lower, upper), nullptr);
  492. if (lastAltNode != nullptr)
  493. {
  494. lastAltNode->tail = tailNode;
  495. }
  496. return tailNode;
  497. }
  498. //
  499. // Errors
  500. //
  501. template <typename P, const bool IsLiteral>
  502. void Parser<P, IsLiteral>::Fail(HRESULT error)
  503. {
  504. throw ParseError(inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  505. }
  506. // This doesn't throw, but stores first error code for throwing later
  507. template <typename P, const bool IsLiteral>
  508. void Parser<P, IsLiteral>::DeferredFailIfUnicode(HRESULT error)
  509. {
  510. Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  511. if (this->deferredIfUnicodeError == nullptr)
  512. {
  513. this->deferredIfUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  514. }
  515. }
  516. // This doesn't throw, but stores first error code for throwing later
  517. template <typename P, const bool IsLiteral>
  518. void Parser<P, IsLiteral>::DeferredFailIfNotUnicode(HRESULT error)
  519. {
  520. Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  521. if (this->deferredIfNotUnicodeError == nullptr)
  522. {
  523. this->deferredIfNotUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  524. }
  525. }
  526. template <typename P, const bool IsLiteral>
  527. inline void Parser<P, IsLiteral>::ECMust(EncodedChar ec, HRESULT error)
  528. {
  529. // We never look for 0
  530. Assert(ec != 0);
  531. if (ECLookahead() != ec)
  532. Fail(error);
  533. ECConsume();
  534. }
  535. template <typename P, const bool IsLiteral>
  536. inline char16 Parser<P, IsLiteral>::NextChar()
  537. {
  538. Assert(!IsEOF());
  539. // Could be an embedded 0
  540. Char c = this->template ReadFull<true>(next, inputLim);
  541. // No embedded newlines in literals
  542. if (IsLiteral && standardChars->IsNewline(c))
  543. Fail(ERRnoSlash);
  544. return c;
  545. }
  546. //
  547. // Patterns/Disjunctions/Alternatives
  548. //
  549. template <typename P, const bool IsLiteral>
  550. void Parser<P, IsLiteral>::PatternPass0()
  551. {
  552. this->positionAfterLastSurrogate = nullptr;
  553. this->deferredIfNotUnicodeError = nullptr;
  554. this->deferredIfUnicodeError = nullptr;
  555. DisjunctionPass0(0);
  556. }
  557. template <typename P, const bool IsLiteral>
  558. Node* Parser<P, IsLiteral>::PatternPass1()
  559. {
  560. return DisjunctionPass1();
  561. }
  562. template <typename P, const bool IsLiteral>
  563. Node* Parser<P, IsLiteral>::UnionNodes(Node* prev, Node* curr)
  564. {
  565. if (prev->tag == Node::MatchChar)
  566. {
  567. MatchCharNode* prevChar = (MatchCharNode*)prev;
  568. if (curr->tag == Node::MatchChar)
  569. {
  570. MatchCharNode* currChar = (MatchCharNode*)curr;
  571. if (prevChar->cs[0] == currChar->cs[0])
  572. // Just ignore current node
  573. return prevChar;
  574. else
  575. {
  576. // Union chars into new set
  577. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
  578. setNode->set.Set(ctAllocator, prevChar->cs[0]);
  579. setNode->set.Set(ctAllocator, currChar->cs[0]);
  580. return setNode;
  581. }
  582. }
  583. else if (curr->tag == Node::MatchSet)
  584. {
  585. MatchSetNode* currSet = (MatchSetNode*)curr;
  586. if (currSet->isNegation)
  587. // Can't merge
  588. return 0;
  589. else
  590. {
  591. // Union chars into new set
  592. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
  593. setNode->set.Set(ctAllocator, prevChar->cs[0]) ;
  594. setNode->set.UnionInPlace(ctAllocator, currSet->set);
  595. return setNode;
  596. }
  597. }
  598. else
  599. // Can't merge
  600. return 0;
  601. }
  602. else if (prev->tag == Node::MatchSet)
  603. {
  604. MatchSetNode* prevSet = (MatchSetNode*)prev;
  605. if (prevSet->isNegation)
  606. // Can't merge
  607. return 0;
  608. else if (curr->tag == Node::MatchChar)
  609. {
  610. MatchCharNode* currChar = (MatchCharNode*)curr;
  611. // Include char in prev set
  612. prevSet->set.Set(ctAllocator, currChar->cs[0]);
  613. return prevSet;
  614. }
  615. else if (curr->tag == Node::MatchSet)
  616. {
  617. MatchSetNode* currSet = (MatchSetNode*)curr;
  618. if (currSet->isNegation)
  619. // Can't merge
  620. return 0;
  621. else
  622. {
  623. // Include chars in prev set
  624. prevSet->set.UnionInPlace(ctAllocator, currSet->set);
  625. return prevSet;
  626. }
  627. }
  628. else
  629. // Can't merge
  630. return 0;
  631. }
  632. else
  633. // Can't merge
  634. return 0;
  635. }
  636. template <typename P, const bool IsLiteral>
  637. void Parser<P, IsLiteral>::DisjunctionPass0(int depth)
  638. {
  639. AlternativePass0(depth);
  640. while (true)
  641. {
  642. // Could be terminating 0
  643. if (ECLookahead() != '|')
  644. return;
  645. ECConsume();
  646. AlternativePass0(depth);
  647. }
  648. }
  649. template <typename P, const bool IsLiteral>
  650. Node* Parser<P, IsLiteral>::DisjunctionPass1()
  651. {
  652. // Maintain the invariants:
  653. // - alt lists have two or more items
  654. // - alt list items are never alt lists (so we must inline them)
  655. // - an alt list never contains two consecutive match-character/match-set nodes
  656. // (so we must union consecutive items into a single set)
  657. Node* node = AlternativePass1();
  658. AltNode* last = 0;
  659. // First node may be an alternative
  660. if (node->tag == Node::Alt)
  661. {
  662. last = (AltNode*)node;
  663. while (last->tail != 0)
  664. last = last->tail;
  665. }
  666. while (true)
  667. {
  668. // Could be terminating 0
  669. if (ECLookahead() != '|')
  670. return node;
  671. ECConsume(); // '|'
  672. Node* next = AlternativePass1();
  673. AnalysisAssert(next != nullptr);
  674. Node* revisedPrev = UnionNodes(last == 0 ? node : last->head, next);
  675. if (revisedPrev != 0)
  676. {
  677. // Can merge next into previously seen alternative
  678. if (last == 0)
  679. node = revisedPrev;
  680. else
  681. last->head = revisedPrev;
  682. }
  683. else if (next->tag == Node::Alt)
  684. {
  685. AltNode* nextList = (AltNode*)next;
  686. // Since we just had to dereference next to get here, we know that nextList
  687. // can't be nullptr in this case.
  688. AnalysisAssert(nextList != nullptr);
  689. // Append inner list to current list
  690. revisedPrev = UnionNodes(last == 0 ? node : last->head, nextList->head);
  691. if (revisedPrev != 0)
  692. {
  693. // Can merge head of list into previously seen alternative
  694. if (last ==0)
  695. node = revisedPrev;
  696. else
  697. last->head = revisedPrev;
  698. nextList = nextList->tail;
  699. }
  700. if (last == 0)
  701. node = Anew(ctAllocator, AltNode, node, nextList);
  702. else
  703. last->tail = nextList;
  704. AnalysisAssert(nextList != nullptr);
  705. while (nextList->tail != 0)
  706. nextList = nextList->tail;
  707. last = nextList;
  708. }
  709. else
  710. {
  711. // Append node
  712. AltNode* cons = Anew(ctAllocator, AltNode, next, 0);
  713. if (last == 0)
  714. node = Anew(ctAllocator, AltNode, node, cons);
  715. else
  716. last->tail = cons;
  717. last = cons;
  718. }
  719. }
  720. }
  721. template <typename P, const bool IsLiteral>
  722. inline bool Parser<P, IsLiteral>::IsEndOfAlternative()
  723. {
  724. EncodedChar ec = ECLookahead();
  725. // Could be terminating 0, but embedded 0 is part of alternative
  726. return (ec == 0 && IsEOF()) || ec == ')' || ec == '|' || (IsLiteral && ec == '/');
  727. }
  728. template <typename P, const bool IsLiteral>
  729. void Parser<P, IsLiteral>::EnsureLitbuf(CharCount size)
  730. {
  731. if (litbufLen - litbufNext < size)
  732. {
  733. CharCount newLen = max(litbufLen, initLitbufSize);
  734. while (newLen < litbufNext + size)
  735. newLen *= 2;
  736. litbuf = (Char*)ctAllocator->Realloc(litbuf, litbufLen * sizeof(Char), newLen * sizeof(Char));
  737. litbufLen = newLen;
  738. }
  739. }
  740. template <typename P, const bool IsLiteral>
  741. void Parser<P, IsLiteral>::AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode)
  742. {
  743. Assert(charOrLiteralNode->tag == Node::MatchChar || charOrLiteralNode->tag == Node::MatchLiteral);
  744. CharCount addLen = charOrLiteralNode->LiteralLength();
  745. Assert(addLen > 0);
  746. if (deferredLiteralNode->length == 0)
  747. {
  748. // Start a new literal
  749. EnsureLitbuf(addLen);
  750. deferredLiteralNode->offset = litbufNext;
  751. deferredLiteralNode->length = addLen;
  752. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  753. }
  754. else if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
  755. {
  756. // Keep growing the current literal
  757. EnsureLitbuf(addLen);
  758. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  759. deferredLiteralNode->length += addLen;
  760. }
  761. else if (charOrLiteralNode->tag == Node::MatchLiteral && deferredLiteralNode->offset + deferredLiteralNode->length == ((MatchLiteralNode*)charOrLiteralNode)->offset)
  762. {
  763. // Absorb next literal into current literal since they are adjacent
  764. deferredLiteralNode->length += addLen;
  765. }
  766. else
  767. {
  768. // Abandon current literal and start a fresh one (leaves gap)
  769. EnsureLitbuf(deferredLiteralNode->length + addLen);
  770. js_wmemcpy_s(litbuf + litbufNext, litbufLen - litbufNext, litbuf + deferredLiteralNode->offset, deferredLiteralNode->length);
  771. deferredLiteralNode->offset = litbufNext;
  772. litbufNext += deferredLiteralNode->length;
  773. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  774. deferredLiteralNode->length += addLen;
  775. }
  776. }
  777. template <typename P, const bool IsLiteral>
  778. Node* Parser<P, IsLiteral>::FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode)
  779. {
  780. if (node == deferredLiteralNode)
  781. {
  782. #if DBG
  783. if (deferredLiteralNode->length == 0)
  784. Assert(false);
  785. #endif
  786. Assert(deferredLiteralNode->offset < litbufNext);
  787. Assert(deferredLiteralNode->offset + deferredLiteralNode->length <= litbufNext);
  788. if (deferredLiteralNode->length == 1)
  789. {
  790. node = Anew(ctAllocator, MatchCharNode, litbuf[deferredLiteralNode->offset]);
  791. if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
  792. // Reclaim last added character
  793. litbufNext--;
  794. // else: leave a gap in the literal buffer
  795. }
  796. else
  797. node = Anew(ctAllocator, MatchLiteralNode, *deferredLiteralNode);
  798. deferredLiteralNode->offset = 0;
  799. deferredLiteralNode->length = 0;
  800. }
  801. return node;
  802. }
  803. template <typename P, const bool IsLiteral>
  804. void Parser<P, IsLiteral>::AlternativePass0(int depth)
  805. {
  806. while (!IsEndOfAlternative())
  807. TermPass0(depth);
  808. }
  809. template <typename P, const bool IsLiteral>
  810. Node* Parser<P, IsLiteral>::AlternativePass1()
  811. {
  812. if (IsEndOfAlternative())
  813. return Anew(ctAllocator, SimpleNode, Node::Empty);
  814. MatchCharNode deferredCharNode(0);
  815. MatchLiteralNode deferredLiteralNode(0, 0);
  816. // Maintain the invariants:
  817. // - concat lists have two or more items
  818. // - concat list items are never concat lists
  819. // - a concat list never contains two consecutive match-character/match-literal nodes
  820. bool previousSurrogatePart = false;
  821. Node* node = TermPass1(&deferredCharNode, previousSurrogatePart);
  822. AnalysisAssert(node != nullptr);
  823. ConcatNode* last = 0;
  824. // First node may be a concat
  825. if (node->tag == Node::Concat)
  826. {
  827. last = (ConcatNode*)node;
  828. while (last->tail != 0)
  829. last = last->tail;
  830. }
  831. if (last == 0)
  832. {
  833. if (node->LiteralLength() > 0)
  834. {
  835. // Begin a new literal
  836. AccumLiteral(&deferredLiteralNode, node);
  837. node = &deferredLiteralNode;
  838. }
  839. }
  840. else
  841. {
  842. if (last->head->LiteralLength() > 0)
  843. {
  844. // Begin a new literal
  845. AccumLiteral(&deferredLiteralNode, last->head);
  846. last->head = &deferredLiteralNode;
  847. }
  848. }
  849. while (!IsEndOfAlternative())
  850. {
  851. Node* next = TermPass1(&deferredCharNode, previousSurrogatePart);
  852. AnalysisAssert(next != nullptr);
  853. if (next->LiteralLength() > 0)
  854. {
  855. // Begin a new literal or grow the existing literal
  856. AccumLiteral(&deferredLiteralNode, next);
  857. if (last == 0)
  858. {
  859. if (node != &deferredLiteralNode)
  860. {
  861. // So far we have first item and the current literal
  862. ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
  863. node = Anew(ctAllocator, ConcatNode, node, cons);
  864. last = cons;
  865. }
  866. // else: keep growing first literal
  867. }
  868. else
  869. {
  870. if (last->head != &deferredLiteralNode)
  871. {
  872. // Append a new literal node
  873. ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
  874. last->tail = cons;
  875. last = cons;
  876. }
  877. // else: keep growing current literal
  878. }
  879. }
  880. else if (next->tag == Node::Concat)
  881. {
  882. // Append this list to accumulated list
  883. ConcatNode* nextList = (ConcatNode*)next;
  884. if (nextList->head->LiteralLength() > 0 &&
  885. ((last == 0 && node == &deferredLiteralNode) ||
  886. (last != 0 && last->head == &deferredLiteralNode)))
  887. {
  888. // Absorb the next character or literal into the current literal
  889. // (may leave a gab in litbuf)
  890. AccumLiteral(&deferredLiteralNode, nextList->head);
  891. nextList = nextList->tail;
  892. // List has at least two items
  893. AnalysisAssert(nextList != 0);
  894. // List should be in canonical form, so no consecutive chars/literals
  895. Assert(nextList->head->LiteralLength() == 0);
  896. }
  897. if (last == 0)
  898. node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), nextList);
  899. else
  900. {
  901. last->head = FinalTerm(last->head, &deferredLiteralNode);
  902. last->tail = nextList;
  903. }
  904. // NextList can't be nullptr, since it was last set as next, which
  905. // was dereferenced, or it was set on a path that already has this
  906. // analysis assert.
  907. AnalysisAssert(nextList != nullptr);
  908. while (nextList->tail != 0)
  909. nextList = nextList->tail;
  910. last = nextList;
  911. // No outstanding literals
  912. Assert(deferredLiteralNode.length == 0);
  913. // We just set this from nextList, which we know is not nullptr.
  914. AnalysisAssert(last != nullptr);
  915. if (last->head->LiteralLength() > 0)
  916. {
  917. // If the list ends with a literal, transfer it into deferredLiteralNode
  918. // so we can continue accumulating (won't leave a gab in litbuf)
  919. AccumLiteral(&deferredLiteralNode, last->head);
  920. // Can discard MatchLiteralNode since it lives in compile-time allocator
  921. last->head = &deferredLiteralNode;
  922. }
  923. }
  924. else
  925. {
  926. // Append this node to accumulated list
  927. ConcatNode* cons = Anew(ctAllocator, ConcatNode, next, 0);
  928. if (last == 0)
  929. node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), cons);
  930. else
  931. {
  932. last->head = FinalTerm(last->head, &deferredLiteralNode);
  933. last->tail = cons;
  934. }
  935. last = cons;
  936. // No outstanding literals
  937. Assert(deferredLiteralNode.length == 0);
  938. }
  939. }
  940. if (last == 0)
  941. node = FinalTerm(node, &deferredLiteralNode);
  942. else
  943. last->head = FinalTerm(last->head, &deferredLiteralNode);
  944. // No outstanding literals
  945. Assert(deferredLiteralNode.length == 0);
  946. return node;
  947. }
  948. //
  949. // Terms
  950. //
  951. template <typename P, const bool IsLiteral>
  952. Node* Parser<P, IsLiteral>::NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body)
  953. {
  954. //
  955. // NOTE: We'd like to represent r? (i.e. r{0,1}) as r|<empty> since the loop representation has high overhead.
  956. // HOWEVER if r contains a group definition and could match empty then we must execute as a loop
  957. // so that group bindings are correctly reset on no progress (e.g.: /(a*)?/.exec("")). Thus we defer
  958. // this optimization until pass 4 of the optimizer, at which point we know whether r could match empty.
  959. //
  960. if (lower == 1 && upper == 1)
  961. return body;
  962. else if (lower == 0 && upper == 0)
  963. // Loop is equivalent to empty. If the loop body contains group definitions they will have already been
  964. // counted towards the overall number of groups. The matcher will initialize their contents to
  965. // undefined, and since the loop body would never execute the inner groups could never be updated from
  966. // undefined.
  967. return Anew(ctAllocator, SimpleNode, Node::Empty);
  968. else
  969. return Anew(ctAllocator, LoopNode, lower, upper, isGreedy, body);
  970. }
  971. template <typename P, const bool IsLiteral>
  972. bool Parser<P, IsLiteral>::AtQuantifier()
  973. {
  974. // Could be terminating 0
  975. switch (ECLookahead())
  976. {
  977. case '*':
  978. case '+':
  979. case '?':
  980. return true;
  981. case '{':
  982. {
  983. CharCount lookahead = 1;
  984. while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
  985. lookahead++;
  986. if (lookahead == 1 || !ECCanConsume(lookahead + 1))
  987. return false;
  988. switch (ECLookahead(lookahead))
  989. {
  990. case ',':
  991. lookahead++;
  992. if (ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}')
  993. return true;
  994. else
  995. {
  996. CharCount saved = lookahead;
  997. while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
  998. lookahead++;
  999. if (lookahead == saved)
  1000. return false;
  1001. return ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}';
  1002. }
  1003. case '}':
  1004. return true;
  1005. default:
  1006. return false;
  1007. }
  1008. }
  1009. default:
  1010. return false;
  1011. }
  1012. }
  1013. template <typename P, const bool IsLiteral>
  1014. bool Parser<P, IsLiteral>::OptNonGreedy()
  1015. {
  1016. // Could be terminating 0
  1017. if (ECLookahead() != '?')
  1018. return true;
  1019. ECConsume();
  1020. return false;
  1021. }
  1022. template <typename P, const bool IsLiteral>
  1023. CharCount Parser<P, IsLiteral>::RepeatCount()
  1024. {
  1025. CharCount n = 0;
  1026. int digits = 0;
  1027. while (true)
  1028. {
  1029. // Could be terminating 0
  1030. EncodedChar ec = ECLookahead();
  1031. if (!standardEncodedChars->IsDigit(ec))
  1032. {
  1033. if (digits == 0)
  1034. {
  1035. Fail(JSERR_RegExpSyntax);
  1036. }
  1037. return n;
  1038. }
  1039. if (n > MaxCharCount / 10)
  1040. {
  1041. break;
  1042. }
  1043. n *= 10;
  1044. if (n > MaxCharCount - standardEncodedChars->DigitValue(ec))
  1045. {
  1046. break;
  1047. }
  1048. n += standardEncodedChars->DigitValue(ec);
  1049. digits++;
  1050. ECConsume();
  1051. }
  1052. Assert(digits != 0); // shouldn't be able to reach here with (digits == 0)
  1053. // The token had a value larger than MaxCharCount so we didn't return the value and reached here instead.
  1054. // Consume the rest of the token and return MaxCharCount.
  1055. while (true)
  1056. {
  1057. EncodedChar ec = ECLookahead();
  1058. if (!standardEncodedChars->IsDigit(ec))
  1059. {
  1060. return MaxCharCount;
  1061. }
  1062. ECConsume();
  1063. }
  1064. }
  1065. template <typename P, const bool IsLiteral>
  1066. void Parser<P, IsLiteral>::TermPass0(int depth)
  1067. {
  1068. PROBE_STACK_NO_DISPOSE(scriptContext, Js::Constants::MinStackRegex);
  1069. // Either we have a location at the start, or the end, never both. As in between it should have been cleared if surrogate pair
  1070. // Or must be cleared if we didn't perform the check
  1071. bool clearLocationIfPresent = this->tempLocationOfSurrogatePair != nullptr;
  1072. switch (ECLookahead())
  1073. {
  1074. case '^':
  1075. case '$':
  1076. ECConsume();
  1077. return;
  1078. case '\\':
  1079. ECConsume();
  1080. if (AtomEscapePass0())
  1081. return;
  1082. break;
  1083. case '(':
  1084. // Can't combine into a single codeunit because of group present
  1085. this->tempLocationOfSurrogatePair = nullptr;
  1086. this->codePointAtTempLocation = 0;
  1087. clearLocationIfPresent = false;
  1088. ECConsume();
  1089. switch (ECLookahead())
  1090. {
  1091. case '?':
  1092. if (!ECCanConsume(2))
  1093. Fail(JSERR_RegExpSyntax);
  1094. switch (ECLookahead(1))
  1095. {
  1096. case '=':
  1097. case '!':
  1098. case ':':
  1099. ECConsume(2);
  1100. break;
  1101. default:
  1102. numGroups++;
  1103. break;
  1104. }
  1105. break;
  1106. default:
  1107. numGroups++;
  1108. break;
  1109. }
  1110. DisjunctionPass0(depth + 1);
  1111. ECMust(')', JSERR_RegExpNoParen);
  1112. break;
  1113. case '.':
  1114. ECConsume();
  1115. break;
  1116. case '[':
  1117. ECConsume();
  1118. this->tempLocationOfSurrogatePair = nullptr;
  1119. this->codePointAtTempLocation = 0;
  1120. this->tempLocationOfRange = next;
  1121. CharacterClassPass0();
  1122. this->tempLocationOfRange = nullptr;
  1123. ECMust(']', JSERR_RegExpNoBracket);
  1124. break;
  1125. case ')':
  1126. case '|':
  1127. Fail(JSERR_RegExpSyntax);
  1128. break;
  1129. case ']':
  1130. case '}':
  1131. NextChar();
  1132. break;
  1133. case '*':
  1134. case '+':
  1135. case '?':
  1136. case '{':
  1137. if (AtQuantifier())
  1138. Fail(JSERR_RegExpBadQuant);
  1139. else
  1140. ECConsume();
  1141. break;
  1142. case 0:
  1143. if (IsEOF())
  1144. // Terminating 0
  1145. Fail(JSERR_RegExpSyntax);
  1146. // else fall-through for embedded 0
  1147. default:
  1148. {
  1149. const EncodedChar* current = next;
  1150. Char c = NextChar();
  1151. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1152. {
  1153. TrackIfSurrogatePair(c, current, (uint32)(next - current));
  1154. }
  1155. // Closing '/' in literals should be caught explicitly
  1156. Assert(!IsLiteral || c != '/');
  1157. }
  1158. break;
  1159. }
  1160. if (clearLocationIfPresent && this->tempLocationOfSurrogatePair != nullptr)
  1161. {
  1162. this->tempLocationOfSurrogatePair = nullptr;
  1163. this->codePointAtTempLocation = 0;
  1164. }
  1165. if (AtQuantifier())
  1166. {
  1167. switch (ECLookahead())
  1168. {
  1169. case '*':
  1170. case '+':
  1171. case '?':
  1172. ECConsume();
  1173. OptNonGreedy();
  1174. break;
  1175. case '{':
  1176. {
  1177. ECConsume();
  1178. CharCount lower = RepeatCount();
  1179. switch (ECLookahead())
  1180. {
  1181. case ',':
  1182. ECConsume();
  1183. if (ECLookahead() == '}')
  1184. {
  1185. ECConsume();
  1186. OptNonGreedy();
  1187. }
  1188. else
  1189. {
  1190. CharCount upper = RepeatCount();
  1191. if (upper < lower)
  1192. Fail(JSERR_RegExpSyntax);
  1193. Assert(ECLookahead() == '}');
  1194. ECConsume();
  1195. OptNonGreedy();
  1196. }
  1197. break;
  1198. case '}':
  1199. ECConsume();
  1200. OptNonGreedy();
  1201. break;
  1202. default:
  1203. Assert(false);
  1204. break;
  1205. }
  1206. break;
  1207. }
  1208. default:
  1209. Assert(false);
  1210. break;
  1211. }
  1212. }
  1213. }
  1214. template <typename P, const bool IsLiteral>
  1215. Node* Parser<P, IsLiteral>::TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1216. {
  1217. PROBE_STACK_NO_DISPOSE(scriptContext, Js::Constants::MinStackRegex);
  1218. Node* node = 0;
  1219. bool containsSurrogatePair = false;
  1220. switch (ECLookahead())
  1221. {
  1222. case '^':
  1223. ECConsume();
  1224. return Anew(ctAllocator, SimpleNode, Node::BOL); // No quantifier allowed
  1225. case '$':
  1226. ECConsume();
  1227. return Anew(ctAllocator, SimpleNode, Node::EOL); // No quantifier allowed
  1228. case '\\':
  1229. ECConsume();
  1230. if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
  1231. {
  1232. break; // For quantifier
  1233. }
  1234. else if (AtomEscapePass1(node, deferredCharNode, previousSurrogatePart))
  1235. {
  1236. return node; // No quantifier allowed
  1237. }
  1238. break; // else: fall-through for opt quantifier
  1239. case '(':
  1240. ECConsume();
  1241. switch (ECLookahead())
  1242. {
  1243. case '?':
  1244. switch (ECLookahead(1))
  1245. {
  1246. case '=':
  1247. ECConsume(2); // ?=
  1248. node = DisjunctionPass1();
  1249. Assert(ECLookahead() == ')');
  1250. ECConsume(); // )
  1251. node = Anew(ctAllocator, AssertionNode, false, node);
  1252. break; // As per Annex B, allow this to be quantifiable
  1253. case '!':
  1254. ECConsume(2); // ?!
  1255. node = DisjunctionPass1();
  1256. Assert(ECLookahead() == ')');
  1257. ECConsume(); // )
  1258. node = Anew(ctAllocator, AssertionNode, true, node);
  1259. break; // As per Annex B, allow this to be quantifiable
  1260. case ':':
  1261. ECConsume(2); // ?:
  1262. node = DisjunctionPass1();
  1263. Assert(ECLookahead() == ')');
  1264. ECConsume(); // )
  1265. break; // fall-through for opt quantifier
  1266. default:
  1267. {
  1268. // ? not yet consumed
  1269. int thisGroupId = nextGroupId++;
  1270. node = DisjunctionPass1();
  1271. Assert(ECLookahead() == ')');
  1272. ECConsume(); // )
  1273. node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
  1274. break; // fall-through for opt quantifier
  1275. }
  1276. }
  1277. break;
  1278. default:
  1279. {
  1280. // next char not yet consumed
  1281. int thisGroupId = nextGroupId++;
  1282. node = DisjunctionPass1();
  1283. Assert(ECLookahead() == ')');
  1284. ECConsume(); // )
  1285. node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
  1286. break; // fall-through for opt quantifier
  1287. }
  1288. }
  1289. break;
  1290. case '.':
  1291. {
  1292. ECConsume();
  1293. node = GetNodeWithValidCharacterSet('.');
  1294. break; // fall-through for opt quantifier
  1295. }
  1296. case '[':
  1297. ECConsume();
  1298. if (unicodeFlagPresent)
  1299. {
  1300. containsSurrogatePair = this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->rangeLocation == next;
  1301. }
  1302. node = containsSurrogatePair ? CharacterClassPass1<true>() : CharacterClassPass1<false>();
  1303. Assert(ECLookahead() == ']');
  1304. ECConsume(); // ]
  1305. break; // fall-through for opt quantifier
  1306. #if DBG
  1307. case ')':
  1308. case '|':
  1309. Assert(false);
  1310. break;
  1311. #endif
  1312. case ']':
  1313. // SPEC DEVIATION: This should be syntax error, instead accept as itself
  1314. deferredCharNode->cs[0] = NextChar();
  1315. node = deferredCharNode;
  1316. break; // fall-through for opt quantifier
  1317. #if DBG
  1318. case '*':
  1319. case '+':
  1320. case '?':
  1321. AssertMsg(false, "Allowed only in the escaped form. These should be caught by TermPass0.");
  1322. break;
  1323. #endif
  1324. case 0:
  1325. if (IsEOF())
  1326. // Terminating 0
  1327. Fail(JSERR_RegExpSyntax);
  1328. // else fall-through for embedded 0
  1329. default:
  1330. if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
  1331. {
  1332. break; //For quantifier
  1333. }
  1334. else
  1335. {
  1336. deferredCharNode->cs[0] = NextChar();
  1337. node = deferredCharNode;
  1338. break; // fall-through for opt quantifier
  1339. }
  1340. }
  1341. Assert(node != 0);
  1342. if (AtQuantifier())
  1343. {
  1344. switch (ECLookahead())
  1345. {
  1346. case '*':
  1347. if (node == deferredCharNode)
  1348. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1349. ECConsume();
  1350. return NewLoopNode(0, CharCountFlag, OptNonGreedy(), node);
  1351. case '+':
  1352. if (node == deferredCharNode)
  1353. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1354. ECConsume();
  1355. return NewLoopNode(1, CharCountFlag, OptNonGreedy(), node);
  1356. case '?':
  1357. if (node == deferredCharNode)
  1358. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1359. ECConsume();
  1360. return NewLoopNode(0, 1, OptNonGreedy(), node);
  1361. case '{':
  1362. {
  1363. if (node == deferredCharNode)
  1364. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1365. ECConsume();
  1366. CharCount lower = RepeatCount();
  1367. switch (ECLookahead())
  1368. {
  1369. case ',':
  1370. ECConsume();
  1371. if (ECLookahead() == '}')
  1372. {
  1373. ECConsume();
  1374. return NewLoopNode(lower, CharCountFlag, OptNonGreedy(), node);
  1375. }
  1376. else
  1377. {
  1378. CharCount upper = RepeatCount();
  1379. Assert(lower <= upper);
  1380. Assert(ECLookahead() == '}');
  1381. ECConsume(); // }
  1382. return NewLoopNode(lower, upper, OptNonGreedy(), node);
  1383. }
  1384. case '}':
  1385. ECConsume();
  1386. return NewLoopNode(lower, lower, OptNonGreedy(), node);
  1387. default:
  1388. Assert(false);
  1389. break;
  1390. }
  1391. break;
  1392. }
  1393. default:
  1394. Assert(false);
  1395. break;
  1396. }
  1397. }
  1398. return node;
  1399. }
  1400. #pragma warning(push)
  1401. #pragma warning(disable:4702) // unreachable code
  1402. template <typename P, const bool IsLiteral>
  1403. bool Parser<P, IsLiteral>::AtomEscapePass0()
  1404. {
  1405. EncodedChar ec = ECLookahead();
  1406. if (ec == 0 && IsEOF())
  1407. {
  1408. // Terminating 0
  1409. Fail(JSERR_RegExpSyntax);
  1410. return false;
  1411. }
  1412. else if (standardEncodedChars->IsDigit(ec))
  1413. {
  1414. do
  1415. {
  1416. ECConsume();
  1417. }
  1418. while (standardEncodedChars->IsDigit(ECLookahead())); // terminating 0 is not a digit
  1419. return false;
  1420. }
  1421. else if (ECLookahead() == 'c')
  1422. {
  1423. if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
  1424. {
  1425. ECConsume(2);
  1426. }
  1427. else
  1428. {
  1429. DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
  1430. }
  1431. return false;
  1432. }
  1433. else
  1434. {
  1435. const EncodedChar *current = next;
  1436. // An escaped '/' is ok
  1437. Char c = NextChar();
  1438. switch (c)
  1439. {
  1440. case 'b':
  1441. case 'B':
  1442. return true;
  1443. // case 'c': handled as special case above
  1444. case 'x':
  1445. if (ECCanConsume(2) &&
  1446. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1447. standardEncodedChars->IsHex(ECLookahead(1)))
  1448. ECConsume(2);
  1449. break;
  1450. case 'u':
  1451. bool surrogateEncountered = false;
  1452. int lengthOfSurrogate = TryParseExtendedUnicodeEscape(c, surrogateEncountered, true);
  1453. if (lengthOfSurrogate > 0)
  1454. {
  1455. if (surrogateEncountered)
  1456. {
  1457. // If we don't have an allocator, we don't create nodes
  1458. // Asserts in place as extra checks for when we do have an allocator
  1459. Assert(this->ctAllocator == nullptr || this->currentSurrogatePairNode != nullptr);
  1460. Assert(this->ctAllocator == nullptr || current == this->currentSurrogatePairNode->location);
  1461. ECConsume(lengthOfSurrogate);
  1462. }
  1463. //Don't fall through
  1464. break;
  1465. }
  1466. else if (ECCanConsume(4) &&
  1467. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1468. standardEncodedChars->IsHex(ECLookahead(1)) &&
  1469. standardEncodedChars->IsHex(ECLookahead(2)) &&
  1470. standardEncodedChars->IsHex(ECLookahead(3)))
  1471. {
  1472. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1473. {
  1474. codepoint_t value = (standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  1475. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  1476. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  1477. (standardEncodedChars->DigitValue(ECLookahead(3)));
  1478. TrackIfSurrogatePair(value, (next - 1), 5);
  1479. }
  1480. ECConsume(4);
  1481. }
  1482. break;
  1483. }
  1484. // embedded 0 is ok
  1485. return false;
  1486. }
  1487. }
  1488. #pragma warning(pop)
  1489. template <typename P, const bool IsLiteral>
  1490. bool Parser<P, IsLiteral>::AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1491. {
  1492. Assert(!IsEOF()); // checked for terminating 0 in pass 0
  1493. if (standardEncodedChars->IsDigit(ECLookahead()))
  1494. {
  1495. // As per Annex B, allow octal escapes as well as group references, disambiguate based on known
  1496. // number of groups.
  1497. if (ECLookahead() == '0')
  1498. {
  1499. // fall through for octal
  1500. }
  1501. else
  1502. {
  1503. // Could be a group reference, but only if between 1 and 5 digits which resolve to a valid group number
  1504. int n = 0;
  1505. CharCount digits = 0;
  1506. do
  1507. {
  1508. n = n * 10 + (int)standardEncodedChars->DigitValue(ECLookahead(digits));
  1509. digits++;
  1510. }
  1511. while (digits < 5 && ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits)));
  1512. if (n >= numGroups ||
  1513. (ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits))))
  1514. {
  1515. if (standardEncodedChars->IsOctal(ECLookahead()))
  1516. {
  1517. // fall through for octal
  1518. }
  1519. else
  1520. {
  1521. // \8 and \9 are identity escapes
  1522. deferredCharNode->cs[0] = Chars<EncodedChar>::CTW(ECLookahead());
  1523. ECConsume();
  1524. node = deferredCharNode;
  1525. return false; // not an assertion
  1526. }
  1527. }
  1528. else
  1529. {
  1530. ECConsume(digits);
  1531. node = Anew(ctAllocator, MatchGroupNode, n);
  1532. return false; // not an assertion
  1533. }
  1534. }
  1535. // Must be between 1 and 3 octal digits
  1536. Assert(standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
  1537. uint n = 0;
  1538. CharCount digits = 0;
  1539. do
  1540. {
  1541. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  1542. if (m > Chars<uint8>::MaxUChar) // Regex octal codes only support single byte (ASCII) characters.
  1543. break;
  1544. n = m;
  1545. ECConsume();
  1546. digits++;
  1547. }
  1548. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
  1549. deferredCharNode->cs[0] = UTC((UChar)n);
  1550. node = deferredCharNode;
  1551. return false; // not an assertion
  1552. }
  1553. else if (ECLookahead() == 'c')
  1554. {
  1555. Char c;
  1556. if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
  1557. {
  1558. c = UTC(Chars<EncodedChar>::CTU(ECLookahead(1)) % 32);
  1559. ECConsume(2);
  1560. }
  1561. else
  1562. {
  1563. // SPEC DEVIATION: For non-letters or EOF, take the leading '\' to be itself, and
  1564. // don't consume the 'c' or letter.
  1565. c = '\\';
  1566. }
  1567. deferredCharNode->cs[0] = c;
  1568. node = deferredCharNode;
  1569. return false; // not an assertion
  1570. }
  1571. else
  1572. {
  1573. Char c = NextChar();
  1574. switch (c)
  1575. {
  1576. case 'b':
  1577. node = Anew(ctAllocator, WordBoundaryNode, false);
  1578. return true; // Is an assertion
  1579. case 'B':
  1580. node = Anew(ctAllocator, WordBoundaryNode, true);
  1581. return true; // Is an assertion
  1582. case 'f':
  1583. c = '\f';
  1584. break; // fall-through for identity escape
  1585. case 'n':
  1586. c = '\n';
  1587. break; // fall-through for identity escape
  1588. case 'r':
  1589. c = '\r';
  1590. break; // fall-through for identity escape
  1591. case 't':
  1592. c = '\t';
  1593. break; // fall-through for identity escape
  1594. case 'v':
  1595. c = '\v';
  1596. break; // fall-through for identity escape
  1597. case 'd':
  1598. {
  1599. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1600. standardChars->SetDigits(ctAllocator, setNode->set);
  1601. node = setNode;
  1602. return false; // not an assertion
  1603. }
  1604. case 'D':
  1605. {
  1606. node = GetNodeWithValidCharacterSet('D');
  1607. return false; // not an assertion
  1608. }
  1609. case 's':
  1610. {
  1611. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1612. standardChars->SetWhitespace(ctAllocator, setNode->set);
  1613. node = setNode;
  1614. return false; // not an assertion
  1615. }
  1616. case 'S':
  1617. {
  1618. node = GetNodeWithValidCharacterSet('S');
  1619. return false; // not an assertion
  1620. }
  1621. case 'w':
  1622. {
  1623. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1624. if (this->unicodeFlagPresent && this->caseInsensitiveFlagPresent)
  1625. {
  1626. standardChars->SetWordIUChars(ctAllocator, setNode->set);
  1627. }
  1628. else
  1629. {
  1630. standardChars->SetWordChars(ctAllocator, setNode->set);
  1631. }
  1632. node = setNode;
  1633. return false; // not an assertion
  1634. }
  1635. case 'W':
  1636. {
  1637. node = GetNodeWithValidCharacterSet('W');
  1638. return false; // not an assertion
  1639. }
  1640. // case 'c': handled as special case above
  1641. case 'x':
  1642. if (ECCanConsume(2) &&
  1643. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1644. standardEncodedChars->IsHex(ECLookahead(1)))
  1645. {
  1646. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  1647. (standardEncodedChars->DigitValue(ECLookahead(1))));
  1648. ECConsume(2);
  1649. // fall-through for identity escape
  1650. }
  1651. // Take to be identity escape if ill-formed as per Annex B
  1652. break;
  1653. case 'u':
  1654. if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
  1655. break;
  1656. else if (ECCanConsume(4) &&
  1657. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1658. standardEncodedChars->IsHex(ECLookahead(1)) &&
  1659. standardEncodedChars->IsHex(ECLookahead(2)) &&
  1660. standardEncodedChars->IsHex(ECLookahead(3)))
  1661. {
  1662. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  1663. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  1664. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  1665. (standardEncodedChars->DigitValue(ECLookahead(3))));
  1666. ECConsume(4);
  1667. // fall-through for identity escape
  1668. }
  1669. // Take to be identity escape if ill-formed as per Annex B
  1670. break;
  1671. case '^':
  1672. case '$':
  1673. case '\\':
  1674. case '.':
  1675. case '*':
  1676. case '+':
  1677. case '?':
  1678. case '(':
  1679. case ')':
  1680. case '[':
  1681. case ']':
  1682. case '{':
  1683. case '}':
  1684. case '|':
  1685. case '/':
  1686. break; // fall-through for identity escape
  1687. default:
  1688. if (this->unicodeFlagPresent)
  1689. {
  1690. // As per #sec-forbidden-extensions, if unicode flag is present, we must disallow any other escape.
  1691. this->Fail(JSERR_RegExpInvalidEscape); // throw SyntaxError
  1692. }
  1693. // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok
  1694. break;
  1695. }
  1696. // Must be an identity escape
  1697. deferredCharNode->cs[0] = c;
  1698. node = deferredCharNode;
  1699. return false; // not an assertion
  1700. }
  1701. }
  1702. template <typename P, const bool IsLiteral>
  1703. bool Parser<P, IsLiteral>::SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1704. {
  1705. if (!this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() || !unicodeFlagPresent)
  1706. {
  1707. return false;
  1708. }
  1709. if (this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
  1710. {
  1711. AssertMsg(!this->currentSurrogatePairNode->IsInsideRange(), "Should not be calling this pass if we are currently inside a range.");
  1712. char16 lower, upper;
  1713. uint tableIndex = 0, actualHigh = 0;
  1714. codepoint_t equivClass[CaseInsensitive::EquivClassSize];
  1715. if (caseInsensitiveFlagPresent && CaseInsensitive::RangeToEquivClass(tableIndex, this->currentSurrogatePairNode->value, this->currentSurrogatePairNode->value, actualHigh, equivClass))
  1716. {
  1717. Node *equivNode[CaseInsensitive::EquivClassSize];
  1718. int indexForNextNode = 0;
  1719. for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
  1720. {
  1721. bool alreadyAdded = false;
  1722. for (int j = 0; j < i; j++)
  1723. {
  1724. if (equivClass[i] == equivClass[j])
  1725. {
  1726. alreadyAdded = true;
  1727. break;
  1728. }
  1729. }
  1730. if (!alreadyAdded)
  1731. {
  1732. if (Js::NumberUtilities::IsInSupplementaryPlane(equivClass[i]))
  1733. {
  1734. Js::NumberUtilities::CodePointAsSurrogatePair(equivClass[i], &lower, &upper);
  1735. equivNode[indexForNextNode] = CreateSurrogatePairAtom(lower, upper);
  1736. }
  1737. else
  1738. {
  1739. equivNode[indexForNextNode] = Anew(ctAllocator, MatchCharNode, (Char)equivClass[i]);
  1740. }
  1741. indexForNextNode ++;
  1742. }
  1743. }
  1744. Assert(indexForNextNode > 0);
  1745. if (indexForNextNode == 1)
  1746. {
  1747. node = equivNode[0];
  1748. }
  1749. else
  1750. {
  1751. AltNode *altNode = Anew(ctAllocator, AltNode, equivNode[0], nullptr);
  1752. AltNode *altNodeTail = altNode;
  1753. for (int i = 1; i < indexForNextNode; i++)
  1754. {
  1755. altNodeTail->tail = Anew(ctAllocator, AltNode, equivNode[i], nullptr);
  1756. altNodeTail = altNodeTail->tail;
  1757. }
  1758. node = altNode;
  1759. }
  1760. }
  1761. else
  1762. {
  1763. Js::NumberUtilities::CodePointAsSurrogatePair(this->currentSurrogatePairNode->value, &lower, &upper);
  1764. node = CreateSurrogatePairAtom(lower, upper);
  1765. }
  1766. previousSurrogatePart = false;
  1767. Assert(ECCanConsume(this->currentSurrogatePairNode->length));
  1768. ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
  1769. this->RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
  1770. this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
  1771. return true;
  1772. }
  1773. return false;
  1774. }
  1775. //
  1776. // Classes
  1777. //
  1778. template <typename P, const bool IsLiteral>
  1779. bool Parser<P, IsLiteral>::AtSecondSingletonClassAtom()
  1780. {
  1781. Assert(ECLookahead() == '-');
  1782. if (ECLookahead(1) == '\\')
  1783. {
  1784. switch (ECLookahead(2))
  1785. {
  1786. case 'd':
  1787. case 'D':
  1788. case 's':
  1789. case 'S':
  1790. case 'w':
  1791. case 'W':
  1792. // These all denote non-singleton sets
  1793. return false;
  1794. default:
  1795. // fall-through for singleton
  1796. break;
  1797. }
  1798. }
  1799. return true;
  1800. }
  1801. template <typename P, const bool IsLiteral>
  1802. void Parser<P, IsLiteral>::CharacterClassPass0()
  1803. {
  1804. // Could be terminating 0
  1805. if (ECLookahead() == '^')
  1806. ECConsume();
  1807. EncodedChar nextChar = ECLookahead();
  1808. const EncodedChar* current;
  1809. codepoint_t lastCodepoint = INVALID_CODEPOINT;
  1810. codepoint_t pendingRangeStart = INVALID_CODEPOINT;
  1811. codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
  1812. bool previousSurrogatePart = false;
  1813. while(nextChar != ']')
  1814. {
  1815. current = next;
  1816. if (nextChar == '\0' && IsEOF())
  1817. {
  1818. // Report as unclosed '['
  1819. Fail(JSERR_RegExpNoBracket);
  1820. return;
  1821. } // Otherwise embedded '\0' is ok
  1822. else if (nextChar == '\\')
  1823. {
  1824. // Consume, as classescapepass0 expects for it to be consumed
  1825. Char outChar = NextChar();
  1826. // If previousSurrogatePart = true upon leaving this method, then we are going to pass through here twice
  1827. // This is because \u{} escape sequence was encountered that is actually 2 characters, the second time we will pass consuming entire character
  1828. if (ClassEscapePass0(outChar, previousSurrogatePart))
  1829. {
  1830. lastCodepoint = outChar;
  1831. }
  1832. else
  1833. {
  1834. // Last codepoint isn't a singleton, so no codepoint tracking for the sake of ranges is needed.
  1835. lastCodepoint = INVALID_CODEPOINT;
  1836. // Unless we have a possible range end, cancel our range tracking.
  1837. if (pendingRangeEnd == INVALID_CODEPOINT)
  1838. {
  1839. pendingRangeStart = INVALID_CODEPOINT;
  1840. }
  1841. }
  1842. }
  1843. else if (nextChar == '-')
  1844. {
  1845. if (pendingRangeStart != INVALID_CODEPOINT || lastCodepoint == INVALID_CODEPOINT)
  1846. {
  1847. // '-' is the upper part of the range, with pendingRangeStart codepoint is the lower. Set lastCodePoint to '-' to check at the end of the while statement
  1848. // OR
  1849. // '-' is just a char, consume it and set as last char
  1850. lastCodepoint = NextChar();
  1851. }
  1852. else
  1853. {
  1854. pendingRangeStart = this->next == this->positionAfterLastSurrogate
  1855. ? this->valueOfLastSurrogate
  1856. : lastCodepoint;
  1857. lastCodepoint = INVALID_CODEPOINT;
  1858. NextChar();
  1859. }
  1860. // If we have a pattern of the form [\ud800-\udfff], we need this to be interpreted as a range.
  1861. // In order to achieve this, the two variables that we use to track surrogate pairs, namely
  1862. // tempLocationOfSurrogatePair and previousSurrogatePart, need to be in a certain state.
  1863. //
  1864. // We need to reset tempLocationOfSurrogatePair as it points to the first Unicode escape (\ud800)
  1865. // when we're here. We need to clear it in order not to have a surrogate pair when we process the
  1866. // second escape (\udfff).
  1867. //
  1868. // previousSurrogatePart is used when we have a code point in the \u{...} extended format and the
  1869. // character is in a supplementary plane. However, there is no need to change its value here. When
  1870. // such an escape sequence is encountered, the first call to ClassEscapePass0() sets the variable
  1871. // to true, but it rewinds the input back to the beginning of the escape sequence. The next
  1872. // iteration of the loop here will again call ClassEscape0() with the same character and the
  1873. // variable will this time be set to false. Therefore, the variable will always be false here.
  1874. tempLocationOfSurrogatePair = nullptr;
  1875. Assert(!previousSurrogatePart);
  1876. }
  1877. else
  1878. {
  1879. lastCodepoint = NextChar();
  1880. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1881. {
  1882. TrackIfSurrogatePair(lastCodepoint, current, (uint32)(next - current));
  1883. }
  1884. }
  1885. if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1886. {
  1887. // This is much easier to handle.
  1888. if (pendingRangeStart != INVALID_CODEPOINT && lastCodepoint != INVALID_CODEPOINT)
  1889. {
  1890. Assert(pendingRangeStart < 0x10000);
  1891. Assert(pendingRangeEnd == INVALID_CODEPOINT);
  1892. if (pendingRangeStart > lastCodepoint)
  1893. {
  1894. Fail(JSERR_RegExpBadRange);
  1895. }
  1896. pendingRangeStart = lastCodepoint = INVALID_CODEPOINT;
  1897. }
  1898. }
  1899. // We have a candidate for range end, and we have a range start.
  1900. else if (pendingRangeStart != INVALID_CODEPOINT && (lastCodepoint != INVALID_CODEPOINT || pendingRangeEnd != INVALID_CODEPOINT))
  1901. {
  1902. // The following will be true at the end of each surrogate pair parse.
  1903. // Note, the escape sequence \u{} is a two time parse, so this will be true on the second time around.
  1904. if (this->next == this->positionAfterLastSurrogate)
  1905. {
  1906. lastCodepoint = this->valueOfLastSurrogate;
  1907. Assert(!previousSurrogatePart);
  1908. pendingRangeEnd = lastCodepoint;
  1909. lastCodepoint = INVALID_CODEPOINT;
  1910. }
  1911. // If the next character is the end of range ']', then we can't have a surrogate pair.
  1912. // The current character is the range end, if we don't already have a candidate.
  1913. else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
  1914. {
  1915. pendingRangeEnd = lastCodepoint;
  1916. }
  1917. //If we get here, and pendingRangeEnd is set. Then one of the above has caused it to be set, or the previous iteration of the loop.
  1918. if (pendingRangeEnd != INVALID_CODEPOINT)
  1919. {
  1920. char16 leftSingleChar, rightSingleChar, ignore;
  1921. if (pendingRangeStart >= 0x10000)
  1922. {
  1923. Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeStart, &ignore, &leftSingleChar);
  1924. }
  1925. else
  1926. {
  1927. leftSingleChar = (char16)pendingRangeStart;
  1928. }
  1929. if (pendingRangeEnd >= 0x10000)
  1930. {
  1931. Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeEnd, &rightSingleChar, &ignore);
  1932. }
  1933. else
  1934. {
  1935. rightSingleChar = (char16)pendingRangeEnd;
  1936. }
  1937. // Here it is a bit tricky, we don't know if we have a unicode option specified.
  1938. // If it is, then \ud800\udc00 - \ud800\udc01 is valid, otherwise invalid.
  1939. if (pendingRangeStart < 0x10000 && pendingRangeEnd < 0x10000 && pendingRangeStart > pendingRangeEnd)
  1940. {
  1941. Fail(JSERR_RegExpBadRange);
  1942. }
  1943. else
  1944. {
  1945. if(leftSingleChar > rightSingleChar)
  1946. {
  1947. DeferredFailIfNotUnicode(JSERR_RegExpBadRange);
  1948. }
  1949. if (pendingRangeStart > pendingRangeEnd)
  1950. {
  1951. DeferredFailIfUnicode(JSERR_RegExpBadRange);
  1952. }
  1953. }
  1954. pendingRangeStart = pendingRangeEnd = INVALID_CODEPOINT;
  1955. }
  1956. // The current char < 0x10000 is a candidate for the range end, but we need to iterate one more time.
  1957. else
  1958. {
  1959. pendingRangeEnd = lastCodepoint;
  1960. }
  1961. }
  1962. nextChar = ECLookahead();
  1963. }
  1964. // We should never have a pendingRangeEnd set when we exit the loop
  1965. Assert(pendingRangeEnd == INVALID_CODEPOINT);
  1966. }
  1967. template <typename P, const bool IsLiteral>
  1968. template <bool containsSurrogates>
  1969. Node* Parser<P, IsLiteral>::CharacterClassPass1()
  1970. {
  1971. Assert(containsSurrogates ? unicodeFlagPresent : true);
  1972. CharSet<codepoint_t> codePointSet;
  1973. MatchSetNode deferredSetNode(false, false);
  1974. MatchCharNode deferredCharNode(0);
  1975. bool isNegation = false;
  1976. if (ECLookahead() == '^')
  1977. {
  1978. isNegation = true;
  1979. ECConsume();
  1980. }
  1981. // We aren't expecting any terminating null characters, only embedded ones that should treated as valid characters.
  1982. // CharacterClassPass0 should have taken care of terminating null.
  1983. codepoint_t pendingCodePoint = INVALID_CODEPOINT;
  1984. codepoint_t pendingRangeStart = INVALID_CODEPOINT;
  1985. EncodedChar nextChar = ECLookahead();
  1986. bool previousWasASurrogate = false;
  1987. bool currIsACharSet = false;
  1988. bool prevWasACharSetAndPartOfRange = false;
  1989. bool prevprevWasACharSetAndPartOfRange = false;
  1990. while(nextChar != ']')
  1991. {
  1992. codepoint_t codePointToSet = INVALID_CODEPOINT;
  1993. // Consume ahead of time if we have two backslashes, both cases below (previously Tracked surrogate pair, and ClassEscapePass1) assume it is.
  1994. if (nextChar == '\\')
  1995. {
  1996. ECConsume();
  1997. }
  1998. // These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
  1999. if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
  2000. {
  2001. codePointToSet = pendingCodePoint;
  2002. pendingCodePoint = this->currentSurrogatePairNode->value;
  2003. Assert(ECCanConsume(this->currentSurrogatePairNode->length));
  2004. ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
  2005. this->RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
  2006. this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
  2007. }
  2008. else if (nextChar == '\\')
  2009. {
  2010. Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
  2011. codePointToSet = pendingCodePoint;
  2012. if (returnedNode->tag == Node::MatchSet)
  2013. {
  2014. if (pendingRangeStart != INVALID_CODEPOINT)
  2015. {
  2016. if (unicodeFlagPresent)
  2017. {
  2018. //A range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
  2019. //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
  2020. Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
  2021. }
  2022. codePointSet.Set(ctAllocator, '-');
  2023. }
  2024. pendingCodePoint = INVALID_CODEPOINT;
  2025. pendingRangeStart = INVALID_CODEPOINT;
  2026. codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
  2027. currIsACharSet = true;
  2028. }
  2029. else
  2030. {
  2031. // Just a character
  2032. pendingCodePoint = deferredCharNode.cs[0];
  2033. }
  2034. }
  2035. else if (nextChar == '-')
  2036. {
  2037. if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
  2038. {
  2039. // - is just a char, or end of a range.
  2040. codePointToSet = pendingCodePoint;
  2041. pendingCodePoint = '-';
  2042. ECConsume();
  2043. }
  2044. else
  2045. {
  2046. pendingRangeStart = pendingCodePoint;
  2047. ECConsume();
  2048. }
  2049. }
  2050. else
  2051. {
  2052. // Just a character, consume it
  2053. codePointToSet = pendingCodePoint;
  2054. pendingCodePoint = NextChar();
  2055. }
  2056. if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
  2057. {
  2058. if (prevprevWasACharSetAndPartOfRange)
  2059. {
  2060. //A range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
  2061. //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
  2062. if (unicodeFlagPresent)
  2063. {
  2064. Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
  2065. }
  2066. if (pendingCodePoint != INVALID_CODEPOINT)
  2067. {
  2068. codePointSet.Set(ctAllocator, pendingCodePoint);
  2069. }
  2070. codePointSet.Set(ctAllocator, '-'); //Add '-' to set because a range was detected but turned out to be a union of character set with '-' and another atom.
  2071. pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
  2072. }
  2073. else if (pendingRangeStart != INVALID_CODEPOINT)
  2074. {
  2075. if (pendingRangeStart > pendingCodePoint)
  2076. {
  2077. //We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
  2078. //This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
  2079. Assert(!unicodeFlagPresent);
  2080. Fail(JSERR_RegExpBadRange);
  2081. }
  2082. codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
  2083. pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
  2084. }
  2085. else
  2086. {
  2087. codePointSet.Set(ctAllocator, codePointToSet);
  2088. }
  2089. }
  2090. nextChar = ECLookahead();
  2091. prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
  2092. prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
  2093. currIsACharSet = false;
  2094. }
  2095. if (pendingCodePoint != INVALID_CODEPOINT)
  2096. {
  2097. codePointSet.Set(ctAllocator, pendingCodePoint);
  2098. }
  2099. // At this point, we have a complete set of codepoints representing the range.
  2100. // Before performing translation of any kind, we need to do some case filling.
  2101. // At the point of this comment, there are no case mappings going cross-plane between simple
  2102. // characters (< 0x10000) and supplementary characters (>= 0x10000)
  2103. // However, it might still be the case, and this has to be handled.
  2104. // On the other hand, we don't want to prevent optimizations that expect non-casefolded sets from happening.
  2105. // At least for simple characters.
  2106. // The simple case, is when the unicode flag isn't specified, we can go ahead and return the simple set.
  2107. // Negations and case mappings will be handled later.
  2108. if (!unicodeFlagPresent)
  2109. {
  2110. Assert(codePointSet.SimpleCharCount() == codePointSet.Count());
  2111. MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
  2112. codePointSet.CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
  2113. return simpleToReturn;
  2114. }
  2115. // Everything past here must be under the flag
  2116. Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2117. if (codePointSet.IsEmpty())
  2118. {
  2119. return Anew(ctAllocator, MatchSetNode, false, false);
  2120. }
  2121. Node* prefixNode = nullptr;
  2122. Node* suffixNode = nullptr;
  2123. CharSet<codepoint_t> *toUseForTranslation = &codePointSet;
  2124. // If a singleton, return a simple character
  2125. bool isSingleton = !this->caseInsensitiveFlagPresent && !isNegation && codePointSet.IsSingleton();
  2126. if (isSingleton)
  2127. {
  2128. codepoint_t singleton = codePointSet.Singleton();
  2129. Node* toReturn = nullptr;
  2130. if (singleton < 0x10000)
  2131. {
  2132. toReturn = Anew(ctAllocator, MatchCharNode, (char16)singleton);
  2133. }
  2134. else
  2135. {
  2136. Assert(unicodeFlagPresent);
  2137. char16 lowerSurrogate, upperSurrogate;
  2138. Js::NumberUtilities::CodePointAsSurrogatePair(singleton, &lowerSurrogate, &upperSurrogate);
  2139. toReturn = CreateSurrogatePairAtom(lowerSurrogate, upperSurrogate);
  2140. }
  2141. codePointSet.Clear(ctAllocator);
  2142. return toReturn;
  2143. }
  2144. // If negation, we want to complement the simple chars.
  2145. // When a set is negated, optimizations skip checking if applicable, so we can go ahead and negate it here.
  2146. CharSet<codepoint_t> negatedSet;
  2147. if (!this->caseInsensitiveFlagPresent)
  2148. {
  2149. if (isNegation)
  2150. {
  2151. // Complement all characters, and use it as the set toTranslate
  2152. codePointSet.ToComplement(ctAllocator, negatedSet);
  2153. }
  2154. toUseForTranslation = isNegation ? &negatedSet : &codePointSet;
  2155. if (isNegation)
  2156. {
  2157. // Clear this, as we will no longer need this.
  2158. codePointSet.FreeBody(ctAllocator);
  2159. }
  2160. }
  2161. else
  2162. {
  2163. CharSet<codepoint_t> caseEquivalent;
  2164. codePointSet.ToEquivClass(ctAllocator, caseEquivalent);
  2165. // Equiv set can't have a reduced count of chars
  2166. Assert(caseEquivalent.Count() >= codePointSet.Count());
  2167. // Here we have a regex that has both case insensitive and unicode options.
  2168. // The range might also be negated. If it is negated, we can go ahead and negate
  2169. // the entire set as well as fill in cases, as optimizations wouldn't kick in anyways.
  2170. if (isNegation)
  2171. {
  2172. codePointSet.Clear(ctAllocator);
  2173. caseEquivalent.ToComplement(ctAllocator, codePointSet);
  2174. caseEquivalent.FreeBody(ctAllocator);
  2175. }
  2176. else
  2177. {
  2178. codePointSet.CloneFrom(ctAllocator, caseEquivalent);
  2179. }
  2180. Assert(toUseForTranslation == &codePointSet);
  2181. }
  2182. uint totalCodePointsCount = toUseForTranslation->Count();
  2183. uint simpleCharsCount = toUseForTranslation->SimpleCharCount();
  2184. if (totalCodePointsCount == simpleCharsCount)
  2185. {
  2186. MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
  2187. toUseForTranslation->CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
  2188. return simpleToReturn;
  2189. }
  2190. if (simpleCharsCount > 0)
  2191. {
  2192. if (!toUseForTranslation->ContainSurrogateCodeUnits())
  2193. {
  2194. MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
  2195. toUseForTranslation->CloneSimpleCharsTo(ctAllocator, node->set);
  2196. prefixNode = node;
  2197. }
  2198. else
  2199. {
  2200. MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
  2201. toUseForTranslation->CloneNonSurrogateCodeUnitsTo(ctAllocator, node->set);
  2202. prefixNode = node;
  2203. node = Anew(ctAllocator, MatchSetNode, false, false);
  2204. toUseForTranslation->CloneSurrogateCodeUnitsTo(ctAllocator, node->set);
  2205. suffixNode = node;
  2206. }
  2207. }
  2208. Assert(unicodeFlagPresent);
  2209. AltNode *headToReturn = prefixNode == nullptr ? nullptr : Anew(ctAllocator, AltNode, prefixNode, nullptr);
  2210. AltNode *currentTail = headToReturn;
  2211. codepoint_t charRangeSearchIndex = 0x10000, lowerCharOfRange = 0, upperCharOfRange = 0;
  2212. while (toUseForTranslation->GetNextRange(charRangeSearchIndex, &lowerCharOfRange, &upperCharOfRange))
  2213. {
  2214. if (lowerCharOfRange == upperCharOfRange)
  2215. {
  2216. currentTail = this->AppendSurrogatePairToDisjunction(lowerCharOfRange, currentTail);
  2217. }
  2218. else
  2219. {
  2220. currentTail = this->AppendSurrogateRangeToDisjunction(lowerCharOfRange, upperCharOfRange, currentTail);
  2221. }
  2222. if (headToReturn == nullptr)
  2223. {
  2224. headToReturn = currentTail;
  2225. }
  2226. AnalysisAssert(currentTail != nullptr);
  2227. while (currentTail->tail != nullptr)
  2228. {
  2229. currentTail = currentTail->tail;
  2230. }
  2231. charRangeSearchIndex = upperCharOfRange + 1;
  2232. }
  2233. if (suffixNode != nullptr)
  2234. {
  2235. currentTail->tail = Anew(ctAllocator, AltNode, suffixNode, nullptr);
  2236. }
  2237. toUseForTranslation->Clear(ctAllocator);
  2238. if (headToReturn != nullptr && headToReturn->tail == nullptr)
  2239. {
  2240. return headToReturn->head;
  2241. }
  2242. return headToReturn;
  2243. }
  2244. #pragma warning(push)
  2245. #pragma warning(disable:4702) // unreachable code
  2246. template <typename P, const bool IsLiteral>
  2247. bool Parser<P, IsLiteral>::ClassEscapePass0(Char& singleton, bool& previousSurrogatePart)
  2248. {
  2249. // Could be terminating 0
  2250. EncodedChar ec = ECLookahead();
  2251. if (ec == 0 && IsEOF())
  2252. {
  2253. Fail(JSERR_RegExpSyntax);
  2254. return false;
  2255. }
  2256. else if (standardEncodedChars->IsOctal(ec))
  2257. {
  2258. uint n = 0;
  2259. CharCount digits = 0;
  2260. do
  2261. {
  2262. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  2263. if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
  2264. break;
  2265. n = m;
  2266. ECConsume();
  2267. digits++;
  2268. }
  2269. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
  2270. singleton = UTC((UChar)n);
  2271. // Clear possible pair
  2272. this->tempLocationOfSurrogatePair = nullptr;
  2273. return true;
  2274. }
  2275. else
  2276. {
  2277. const EncodedChar* location = this->tempLocationOfSurrogatePair;
  2278. // Clear it for now, otherwise to many branches to clear it on.
  2279. this->tempLocationOfSurrogatePair = nullptr;
  2280. // An escaped '/' is ok
  2281. Char c = NextChar();
  2282. switch (c)
  2283. {
  2284. case 'b':
  2285. singleton = '\b';
  2286. return true;
  2287. case 'f':
  2288. singleton = '\f';
  2289. return true;
  2290. case 'n':
  2291. singleton = '\n';
  2292. return true;
  2293. case 'r':
  2294. singleton = '\r';
  2295. return true;
  2296. case 't':
  2297. singleton = '\t';
  2298. return true;
  2299. case 'v':
  2300. singleton = '\v';
  2301. return true;
  2302. case 'd':
  2303. case 'D':
  2304. case 's':
  2305. case 'S':
  2306. case 'w':
  2307. case 'W':
  2308. return false;
  2309. case 'c':
  2310. if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
  2311. {
  2312. singleton = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
  2313. ECConsume();
  2314. }
  2315. else
  2316. {
  2317. DeferredFailIfUnicode(JSERR_RegExpInvalidEscape); // Fail in unicode mode for non-letter escaped control characters according to 262 Annex-B RegExp grammar spec #prod-annexB-Term
  2318. if (!IsEOF())
  2319. {
  2320. EncodedChar ecLookahead = ECLookahead();
  2321. switch (ecLookahead)
  2322. {
  2323. case '-':
  2324. case ']':
  2325. singleton = c;
  2326. break;
  2327. default:
  2328. singleton = UTC(Chars<EncodedChar>::CTU(ecLookahead) % 32);
  2329. ECConsume();
  2330. break;
  2331. }
  2332. }
  2333. else
  2334. singleton = c;
  2335. }
  2336. return true;
  2337. case 'x':
  2338. if (ECCanConsume(2) &&
  2339. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2340. standardEncodedChars->IsHex(ECLookahead(1)))
  2341. {
  2342. singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  2343. (standardEncodedChars->DigitValue(ECLookahead(1))));
  2344. ECConsume(2);
  2345. }
  2346. else
  2347. singleton = c;
  2348. return true;
  2349. case 'u':
  2350. this->tempLocationOfSurrogatePair = location;
  2351. if (this->TryParseExtendedUnicodeEscape(singleton, previousSurrogatePart, true) > 0)
  2352. return true;
  2353. else if (ECCanConsume(4) &&
  2354. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2355. standardEncodedChars->IsHex(ECLookahead(1)) &&
  2356. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2357. standardEncodedChars->IsHex(ECLookahead(3)))
  2358. {
  2359. singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  2360. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  2361. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  2362. (standardEncodedChars->DigitValue(ECLookahead(3))));
  2363. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2364. {
  2365. // Current location
  2366. TrackIfSurrogatePair(singleton, (next - 1), 5);
  2367. }
  2368. // The above if statement, if true, will clear tempLocationOfSurrogatePair if needs to.
  2369. ECConsume(4);
  2370. }
  2371. else
  2372. singleton = c;
  2373. return true;
  2374. default:
  2375. // embedded 0 is ok
  2376. singleton = c;
  2377. return true;
  2378. }
  2379. }
  2380. }
  2381. #pragma warning(pop)
  2382. template <typename P, const bool IsLiteral>
  2383. Node* Parser<P, IsLiteral>::ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart)
  2384. {
  2385. // Checked for terminating 0 is pass 0
  2386. Assert(!IsEOF());
  2387. if (standardEncodedChars->IsOctal(ECLookahead()))
  2388. {
  2389. // As per Annex B, allow octal escapes instead of just \0 (and \8 and \9 are identity escapes).
  2390. // Must be between 1 and 3 octal digits.
  2391. uint n = 0;
  2392. CharCount digits = 0;
  2393. do
  2394. {
  2395. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  2396. if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
  2397. break;
  2398. n = m;
  2399. ECConsume();
  2400. digits++;
  2401. }
  2402. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
  2403. deferredCharNode->cs[0] = UTC((UChar)n);
  2404. return deferredCharNode;
  2405. }
  2406. else
  2407. {
  2408. Char c = NextChar();
  2409. switch (c)
  2410. {
  2411. case 'b':
  2412. c = '\b';
  2413. break; // fall-through for identity escape
  2414. case 'f':
  2415. c = '\f';
  2416. break; // fall-through for identity escape
  2417. case 'n':
  2418. c = '\n';
  2419. break; // fall-through for identity escape
  2420. case 'r':
  2421. c = '\r';
  2422. break; // fall-through for identity escape
  2423. case 't':
  2424. c = '\t';
  2425. break; // fall-through for identity escape
  2426. case 'v':
  2427. c = '\v';
  2428. break; // fall-through for identity escape
  2429. case 'd':
  2430. standardChars->SetDigits(ctAllocator, deferredSetNode->set);
  2431. return deferredSetNode;
  2432. case 'D':
  2433. standardChars->SetNonDigits(ctAllocator, deferredSetNode->set);
  2434. return deferredSetNode;
  2435. case 's':
  2436. standardChars->SetWhitespace(ctAllocator, deferredSetNode->set);
  2437. return deferredSetNode;
  2438. case 'S':
  2439. standardChars->SetNonWhitespace(ctAllocator, deferredSetNode->set);
  2440. return deferredSetNode;
  2441. case 'w':
  2442. standardChars->SetWordChars(ctAllocator, deferredSetNode->set);
  2443. return deferredSetNode;
  2444. case 'W':
  2445. standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
  2446. return deferredSetNode;
  2447. case 'c':
  2448. if (standardEncodedChars->IsWord(ECLookahead())) // terminating 0 is not a word character
  2449. {
  2450. c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
  2451. ECConsume();
  2452. // fall-through for identity escape
  2453. }
  2454. else
  2455. {
  2456. // If the lookahead is a non-alphanumeric and not an underscore ('_'), then treat '\' and 'c' separately.
  2457. //#sec-regular-expression-patterns-semantics
  2458. ECRevert(1); //Put cursor back at 'c' and treat it as a non-escaped character.
  2459. deferredCharNode->cs[0] = '\\';
  2460. return deferredCharNode;
  2461. }
  2462. break;
  2463. case 'x':
  2464. if (ECCanConsume(2) &&
  2465. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2466. standardEncodedChars->IsHex(ECLookahead(1)))
  2467. {
  2468. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  2469. (standardEncodedChars->DigitValue(ECLookahead(1))));
  2470. ECConsume(2);
  2471. // fall-through for identity escape
  2472. }
  2473. // Take to be identity escape if ill-formed as per Annex B
  2474. break;
  2475. case 'u':
  2476. if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
  2477. break;
  2478. else if (ECCanConsume(4) &&
  2479. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2480. standardEncodedChars->IsHex(ECLookahead(1)) &&
  2481. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2482. standardEncodedChars->IsHex(ECLookahead(3)))
  2483. {
  2484. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  2485. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  2486. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  2487. (standardEncodedChars->DigitValue(ECLookahead(3))));
  2488. ECConsume(4);
  2489. // fall-through for identity escape
  2490. }
  2491. // Take to be identity escape if ill-formed as per Annex B.
  2492. break;
  2493. default:
  2494. // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok.
  2495. break;
  2496. }
  2497. // Must be an identity escape
  2498. deferredCharNode->cs[0] = c;
  2499. return deferredCharNode;
  2500. }
  2501. }
  2502. //
  2503. // Options
  2504. //
  2505. template <typename P, const bool IsLiteral>
  2506. void Parser<P, IsLiteral>::Options(RegexFlags& flags)
  2507. {
  2508. while (true)
  2509. {
  2510. // Could be terminating 0
  2511. EncodedChar ec = ECLookahead();
  2512. CharCount consume;
  2513. Char c;
  2514. if (ec == 0)
  2515. // Embedded 0 not valid
  2516. return;
  2517. else if (IsLiteral &&
  2518. ec == '\\' &&
  2519. ECCanConsume(6) &&
  2520. ECLookahead(1) == 'u' &&
  2521. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2522. standardEncodedChars->IsHex(ECLookahead(3)) &&
  2523. standardEncodedChars->IsHex(ECLookahead(4)) &&
  2524. standardEncodedChars->IsHex(ECLookahead(5)))
  2525. {
  2526. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2527. {
  2528. Fail(JSERR_RegExpSyntax);
  2529. return;
  2530. }
  2531. else
  2532. {
  2533. uint32 n = (standardEncodedChars->DigitValue(ECLookahead(2)) << 12) |
  2534. (standardEncodedChars->DigitValue(ECLookahead(3)) << 8) |
  2535. (standardEncodedChars->DigitValue(ECLookahead(4)) << 4) |
  2536. (standardEncodedChars->DigitValue(ECLookahead(5)));
  2537. c = UTC(n);
  2538. consume = 6;
  2539. }
  2540. }
  2541. else
  2542. {
  2543. c = Chars<EncodedChar>::CTW(ec);
  2544. consume = 1;
  2545. }
  2546. switch (c) {
  2547. case 'i':
  2548. if ((flags & IgnoreCaseRegexFlag) != 0)
  2549. {
  2550. Fail(JSERR_RegExpSyntax);
  2551. }
  2552. flags = (RegexFlags)(flags | IgnoreCaseRegexFlag);
  2553. break;
  2554. case 'g':
  2555. if ((flags & GlobalRegexFlag) != 0)
  2556. {
  2557. Fail(JSERR_RegExpSyntax);
  2558. }
  2559. flags = (RegexFlags)(flags | GlobalRegexFlag);
  2560. break;
  2561. case 'm':
  2562. if ((flags & MultilineRegexFlag) != 0)
  2563. {
  2564. Fail(JSERR_RegExpSyntax);
  2565. }
  2566. flags = (RegexFlags)(flags | MultilineRegexFlag);
  2567. break;
  2568. case 's':
  2569. if (scriptContext->GetConfig()->IsES2018RegExDotAllEnabled())
  2570. {
  2571. if ((flags & DotAllRegexFlag) != 0)
  2572. {
  2573. Fail(JSERR_RegExpSyntax);
  2574. }
  2575. flags = (RegexFlags)(flags | DotAllRegexFlag);
  2576. break;
  2577. }
  2578. case 'u':
  2579. // If we don't have unicode enabled, fall through to default
  2580. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2581. {
  2582. if ((flags & UnicodeRegexFlag) != 0)
  2583. {
  2584. Fail(JSERR_RegExpSyntax);
  2585. }
  2586. flags = (RegexFlags)(flags | UnicodeRegexFlag);
  2587. // For telemetry
  2588. CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(ES6, UnicodeRegexFlag, scriptContext);
  2589. break;
  2590. }
  2591. case 'y':
  2592. if (scriptContext->GetConfig()->IsES6RegExStickyEnabled())
  2593. {
  2594. if ((flags & StickyRegexFlag) != 0)
  2595. {
  2596. Fail(JSERR_RegExpSyntax);
  2597. }
  2598. flags = (RegexFlags)(flags | StickyRegexFlag);
  2599. // For telemetry
  2600. CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(ES6, StickyRegexFlag, scriptContext);
  2601. break;
  2602. }
  2603. default:
  2604. if (standardChars->IsWord(c))
  2605. {
  2606. // Outer context could never parse this character. Signal the syntax error as
  2607. // being part of the regex.
  2608. Fail(JSERR_RegExpSyntax);
  2609. }
  2610. return;
  2611. }
  2612. ECConsume(consume);
  2613. }
  2614. }
  2615. //
  2616. // Entry points
  2617. //
  2618. template <typename P, const bool IsLiteral>
  2619. Node* Parser<P, IsLiteral>::ParseDynamic
  2620. ( const EncodedChar* body
  2621. , const EncodedChar* bodyLim
  2622. , const EncodedChar* opts
  2623. , const EncodedChar* optsLim
  2624. , RegexFlags& flags )
  2625. {
  2626. Assert(!IsLiteral);
  2627. Assert(body != 0);
  2628. Assert(bodyLim >= body && *bodyLim == 0);
  2629. Assert(opts == 0 || (optsLim >= opts && *optsLim == 0));
  2630. // Body, pass 0
  2631. SetPosition(body, bodyLim, true);
  2632. PatternPass0();
  2633. if (!IsEOF())
  2634. Fail(JSERR_RegExpSyntax);
  2635. // Options
  2636. if (opts != 0)
  2637. {
  2638. SetPosition(opts, optsLim, false);
  2639. Options(flags);
  2640. if (!IsEOF())
  2641. Fail(JSERR_RegExpSyntax);
  2642. this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2643. this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2644. this->dotAllFlagPresent = (flags & UnifiedRegex::DotAllRegexFlag) == UnifiedRegex::DotAllRegexFlag;
  2645. Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2646. Assert(!this->dotAllFlagPresent || scriptContext->GetConfig()->IsES2018RegExDotAllEnabled());
  2647. }
  2648. else
  2649. {
  2650. this->unicodeFlagPresent = false;
  2651. this->caseInsensitiveFlagPresent = false;
  2652. this->dotAllFlagPresent = false;
  2653. }
  2654. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2655. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2656. {
  2657. throw ParseError(*deferredIfNotUnicodeError);
  2658. }
  2659. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2660. {
  2661. throw ParseError(*deferredIfUnicodeError);
  2662. }
  2663. this->currentSurrogatePairNode = this->surrogatePairList;
  2664. // Body, pass 1
  2665. SetPosition(body, bodyLim, true);
  2666. Node* root = PatternPass1();
  2667. Assert(IsEOF());
  2668. return root;
  2669. }
  2670. template <typename P, const bool IsLiteral>
  2671. Node* Parser<P, IsLiteral>::ParseLiteral
  2672. ( const EncodedChar* input
  2673. , const EncodedChar* inputLim
  2674. , CharCount& outBodyEncodedChars
  2675. , CharCount& outTotalEncodedChars
  2676. , CharCount& outBodyChars
  2677. , CharCount& outTotalChars
  2678. , RegexFlags& flags )
  2679. {
  2680. Assert(IsLiteral);
  2681. Assert(input != 0);
  2682. Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
  2683. // To handle surrogate pairs properly under unicode option, we will collect information on location of the pairs
  2684. // during pass 0, regardless if the option is present. (We aren't able to get it at that time)
  2685. // During pass 1, we will use that information to correctly create appropriate nodes.
  2686. // Body, pass 0
  2687. SetPosition(input, inputLim, true);
  2688. PatternPass0();
  2689. outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2690. outBodyChars = Pos();
  2691. // Options are needed for the next pass
  2692. ECMust('/', ERRnoSlash);
  2693. Options(flags);
  2694. this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2695. this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2696. this->dotAllFlagPresent = (flags & UnifiedRegex::DotAllRegexFlag) == UnifiedRegex::DotAllRegexFlag;
  2697. Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2698. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2699. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2700. {
  2701. throw ParseError(*deferredIfNotUnicodeError);
  2702. }
  2703. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2704. {
  2705. throw ParseError(*deferredIfUnicodeError);
  2706. }
  2707. // Used below to proceed to the end of the regex
  2708. const EncodedChar *pastOptions = next;
  2709. this->currentSurrogatePairNode = this->surrogatePairList;
  2710. // Body, pass 1
  2711. SetPosition(input, inputLim, true);
  2712. Node* root = PatternPass1();
  2713. Assert(outBodyEncodedChars == Chars<EncodedChar>::OSB(next, input));
  2714. Assert(outBodyChars == Pos());
  2715. next = pastOptions;
  2716. outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2717. outTotalChars = Pos();
  2718. return root;
  2719. }
  2720. template <typename P, const bool IsLiteral>
  2721. void Parser<P, IsLiteral>::ParseLiteralNoAST
  2722. ( const EncodedChar* input
  2723. , const EncodedChar* inputLim
  2724. , CharCount& outBodyEncodedChars
  2725. , CharCount& outTotalEncodedChars
  2726. , CharCount& outBodyChars
  2727. , CharCount& outTotalChars )
  2728. {
  2729. Assert(IsLiteral);
  2730. Assert(input != 0);
  2731. Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
  2732. // Body, pass 0
  2733. SetPosition(input, inputLim, true);
  2734. PatternPass0();
  2735. outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2736. outBodyChars = Pos();
  2737. // Options
  2738. ECMust('/', ERRnoSlash);
  2739. RegexFlags dummyFlags = NoRegexFlags;
  2740. Options(dummyFlags);
  2741. this->unicodeFlagPresent = (dummyFlags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2742. this->caseInsensitiveFlagPresent = (dummyFlags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2743. this->dotAllFlagPresent = (dummyFlags & UnifiedRegex::DotAllRegexFlag) == UnifiedRegex::DotAllRegexFlag;
  2744. outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2745. outTotalChars = Pos();
  2746. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2747. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2748. {
  2749. throw ParseError(*deferredIfNotUnicodeError);
  2750. }
  2751. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2752. {
  2753. throw ParseError(*deferredIfUnicodeError);
  2754. }
  2755. }
  2756. template <typename P, const bool IsLiteral>
  2757. template <const bool buildAST>
  2758. RegexPattern * Parser<P, IsLiteral>::CompileProgram
  2759. ( Node* root,
  2760. const EncodedChar*& currentCharacter,
  2761. const CharCount totalLen,
  2762. const CharCount bodyChars,
  2763. const CharCount bodyEncodedChars,
  2764. const CharCount totalChars,
  2765. const RegexFlags flags )
  2766. {
  2767. Assert(IsLiteral);
  2768. Program* program = nullptr;
  2769. if (buildAST)
  2770. {
  2771. const auto recycler = this->scriptContext->GetRecycler();
  2772. program = Program::New(recycler, flags);
  2773. this->CaptureSourceAndGroups(recycler, program, currentCharacter, bodyChars, bodyEncodedChars);
  2774. }
  2775. currentCharacter += totalLen;
  2776. Assert(GetMultiUnits() == totalLen - totalChars);
  2777. if (!buildAST)
  2778. {
  2779. return nullptr;
  2780. }
  2781. RegexPattern* pattern = RegexPattern::New(this->scriptContext, program, true);
  2782. #if ENABLE_REGEX_CONFIG_OPTIONS
  2783. RegexStats* stats = 0;
  2784. if (REGEX_CONFIG_FLAG(RegexProfile))
  2785. {
  2786. stats = this->scriptContext->GetRegexStatsDatabase()->GetRegexStats(pattern);
  2787. this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Parse);
  2788. }
  2789. if (REGEX_CONFIG_FLAG(RegexTracing))
  2790. {
  2791. DebugWriter* tw = this->scriptContext->GetRegexDebugWriter();
  2792. tw->Print(_u("// REGEX COMPILE "));
  2793. pattern->Print(tw);
  2794. tw->EOL();
  2795. }
  2796. if (REGEX_CONFIG_FLAG(RegexProfile))
  2797. this->scriptContext->GetRegexStatsDatabase()->BeginProfile();
  2798. #endif
  2799. ArenaAllocator* rtAllocator = this->scriptContext->RegexAllocator();
  2800. Compiler::Compile
  2801. ( this->scriptContext
  2802. , ctAllocator
  2803. , rtAllocator
  2804. , standardChars
  2805. , program
  2806. , root
  2807. , this->GetLitbuf()
  2808. , pattern
  2809. #if ENABLE_REGEX_CONFIG_OPTIONS
  2810. , w
  2811. , stats
  2812. #endif
  2813. );
  2814. #if ENABLE_REGEX_CONFIG_OPTIONS
  2815. if (REGEX_CONFIG_FLAG(RegexProfile))
  2816. this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Compile);
  2817. #endif
  2818. #ifdef PROFILE_EXEC
  2819. this->scriptContext->ProfileEnd(Js::RegexCompilePhase);
  2820. #endif
  2821. // CaptureSourceAndGroups throws if this condition doesn't hold.
  2822. Assert(0 < pattern->NumGroups() && pattern->NumGroups() <= MAX_NUM_GROUPS);
  2823. return pattern;
  2824. }
  2825. template <typename P, const bool IsLiteral>
  2826. void Parser<P, IsLiteral>::CaptureEmptySourceAndNoGroups(Program* program)
  2827. {
  2828. Assert(program->source == 0);
  2829. program->source = const_cast<Char*>(_u(""));
  2830. program->sourceLen = 0;
  2831. program->numGroups = 1;
  2832. // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
  2833. }
  2834. template <typename P, const bool IsLiteral>
  2835. void Parser<P, IsLiteral>::CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars)
  2836. {
  2837. Assert(program->source == 0);
  2838. Assert(body != 0);
  2839. // Program will own source string
  2840. program->source = RecyclerNewArrayLeaf(recycler, Char, bodyChars + 1);
  2841. // Don't need to zero out since we're writing to the buffer right here
  2842. this->ConvertToUnicode(program->source, bodyChars, body, body + bodyEncodedChars);
  2843. program->source[bodyChars] = 0;
  2844. program->sourceLen = bodyChars;
  2845. // We expect nextGroupId to be positive, because the full regexp itself always
  2846. // counts as a capturing group.
  2847. Assert(nextGroupId > 0);
  2848. if (nextGroupId > MAX_NUM_GROUPS)
  2849. {
  2850. Js::JavascriptError::ThrowRangeError(this->scriptContext, JSERR_RegExpTooManyCapturingGroups);
  2851. }
  2852. else
  2853. {
  2854. program->numGroups = static_cast<uint16>(nextGroupId);
  2855. }
  2856. // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
  2857. }
  2858. template <typename P, const bool IsLiteral>
  2859. Node* Parser<P, IsLiteral>::GetNodeWithValidCharacterSet(EncodedChar cc)
  2860. {
  2861. Node* nodeToReturn = nullptr;
  2862. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && this->unicodeFlagPresent)
  2863. {
  2864. MatchSetNode *lowerRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
  2865. lowerRangeNode->set.SetRange(ctAllocator, (Char)0xD800, (Char)0xDBFF);
  2866. MatchSetNode *upperRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
  2867. upperRangeNode->set.SetRange(ctAllocator, (Char)0xDC00, (Char)0xDFFF);
  2868. ConcatNode* surrogateRangePairNode = Anew(ctAllocator, ConcatNode, lowerRangeNode, Anew(ctAllocator, ConcatNode, upperRangeNode, nullptr));
  2869. // The MatchSet node will be split into [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) as a prefix, and a suffix of [D800-DBFF]
  2870. // i.e. The MatchSet node can be with [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) OR [D800-DBFF]
  2871. MatchSetNode* partialPrefixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  2872. switch (cc)
  2873. {
  2874. case '.':
  2875. if (this->dotAllFlagPresent)
  2876. {
  2877. standardChars->SetFullSet(ctAllocator, partialPrefixSetNode->set);
  2878. }
  2879. else
  2880. {
  2881. standardChars->SetNonNewline(ctAllocator, partialPrefixSetNode->set);
  2882. }
  2883. break;
  2884. case 'S':
  2885. standardChars->SetNonWhitespace(ctAllocator, partialPrefixSetNode->set);
  2886. break;
  2887. case 'D':
  2888. standardChars->SetNonDigits(ctAllocator, partialPrefixSetNode->set);
  2889. break;
  2890. case 'W':
  2891. if (this->caseInsensitiveFlagPresent)
  2892. {
  2893. standardChars->SetNonWordIUChars(ctAllocator, partialPrefixSetNode->set);
  2894. }
  2895. else
  2896. {
  2897. standardChars->SetNonWordChars(ctAllocator, partialPrefixSetNode->set);
  2898. }
  2899. break;
  2900. default:
  2901. AssertMsg(false, "");
  2902. }
  2903. partialPrefixSetNode->set.SubtractRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
  2904. MatchSetNode* partialSuffixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  2905. partialSuffixSetNode->set.SetRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
  2906. AltNode* altNode = Anew(ctAllocator, AltNode, partialPrefixSetNode, Anew(ctAllocator, AltNode, surrogateRangePairNode, Anew(ctAllocator, AltNode, partialSuffixSetNode, nullptr)));
  2907. nodeToReturn = altNode;
  2908. }
  2909. else
  2910. {
  2911. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  2912. switch (cc)
  2913. {
  2914. case '.':
  2915. if (this->dotAllFlagPresent)
  2916. {
  2917. standardChars->SetFullSet(ctAllocator, setNode->set);
  2918. }
  2919. else
  2920. {
  2921. standardChars->SetNonNewline(ctAllocator, setNode->set);
  2922. }
  2923. break;
  2924. case 'S':
  2925. standardChars->SetNonWhitespace(ctAllocator, setNode->set);
  2926. break;
  2927. case 'D':
  2928. standardChars->SetNonDigits(ctAllocator, setNode->set);
  2929. break;
  2930. case 'W':
  2931. standardChars->SetNonWordChars(ctAllocator, setNode->set);
  2932. break;
  2933. default:
  2934. AssertMsg(false, "");
  2935. }
  2936. nodeToReturn = setNode;
  2937. }
  2938. return nodeToReturn;
  2939. }
  2940. template <typename P, const bool IsLiteral>
  2941. void Parser<P, IsLiteral>::FreeBody()
  2942. {
  2943. if (litbuf != 0)
  2944. {
  2945. ctAllocator->Free(litbuf, litbufLen);
  2946. litbuf = nullptr;
  2947. litbufLen = 0;
  2948. litbufNext = 0;
  2949. }
  2950. }
  2951. // Instantiate all templates
  2952. #define INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, IsLiteral, BuildAST) \
  2953. template RegexPattern* Parser<EncodingPolicy, IsLiteral>::CompileProgram<BuildAST>(Node* root, const EncodedChar*& currentCharacter, const CharCount totalLen, const CharCount bodyChars, const CharCount bodyEncodedChars, const CharCount totalChars, const RegexFlags flags );
  2954. #define INSTANTIATE_REGEX_PARSER(EncodingPolicy) \
  2955. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, false, false) \
  2956. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, false, true) \
  2957. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, true, false) \
  2958. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, true, true) \
  2959. template class Parser<EncodingPolicy, false>; \
  2960. template class Parser<EncodingPolicy, true>;
  2961. // Instantiate the Parser
  2962. INSTANTIATE_REGEX_PARSER(NullTerminatedUnicodeEncodingPolicy);
  2963. INSTANTIATE_REGEX_PARSER(NotNullTerminatedUTF8EncodingPolicy);
  2964. }