RegexParser.cpp 127 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277727782779278027812782278327842785278627872788278927902791279227932794279527962797279827992800280128022803280428052806280728082809281028112812281328142815281628172818281928202821282228232824282528262827282828292830283128322833283428352836283728382839284028412842284328442845284628472848284928502851285228532854285528562857285828592860286128622863286428652866286728682869287028712872287328742875287628772878287928802881288228832884288528862887288828892890289128922893289428952896289728982899290029012902290329042905290629072908290929102911291229132914291529162917291829192920292129222923292429252926292729282929293029312932293329342935293629372938293929402941294229432944294529462947294829492950295129522953295429552956295729582959296029612962296329642965296629672968296929702971297229732974297529762977297829792980298129822983298429852986298729882989299029912992299329942995299629972998299930003001300230033004300530063007300830093010301130123013301430153016301730183019302030213022302330243025302630273028302930303031303230333034303530363037303830393040304130423043304430453046304730483049305030513052305330543055305630573058305930603061306230633064306530663067306830693070307130723073307430753076307730783079308030813082308330843085308630873088308930903091309230933094309530963097309830993100310131023103310431053106310731083109311031113112311331143115311631173118311931203121312231233124312531263127312831293130313131323133313431353136313731383139314031413142314331443145314631473148314931503151315231533154315531563157315831593160316131623163316431653166316731683169317031713172317331743175317631773178317931803181318231833184318531863187318831893190319131923193
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. /*
  6. --------------------------------------------------------------------------------------------------------------------------------
  7. Original
  8. --------------------------------------------------------------------------------------------------------------------------------
  9. Pattern ::= Disjunction
  10. Disjunction ::= Alternative | Alternative '|' Disjunction
  11. Alternative ::= [empty] | Alternative Term
  12. Term ::= Assertion | Atom | Atom Quantifier
  13. Assertion ::= '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  14. Quantifier ::= QuantifierPrefix | QuantifierPrefix '?'
  15. QuantifierPrefix ::= '*' | '+' | '?' | '{' DecimalDigits '}' | '{' DecimalDigits ',' '}' | '{' DecimalDigits ',' DecimalDigits '}'
  16. Atom ::= PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')'
  17. PatternCharacter ::= SourceCharacter but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
  18. AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
  19. CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
  20. ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
  21. ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
  22. IdentityEscape ::= (SourceCharacter but not IdentifierPart) | <ZWJ> | <ZWNJ>
  23. DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
  24. CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
  25. CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
  26. ClassRanges ::= [empty] | NonemptyClassRanges
  27. NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
  28. NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
  29. ClassAtom ::= '-' | ClassAtomNoDash
  30. ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
  31. ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
  32. SourceCharacter ::= <unicode character>
  33. HexEscapeSequence ::= 'x' HexDigit HexDigit
  34. UnicodeEscapeSequence ::= 'u' HexDigit HexDigit HexDigit HexDigit | 'u' '{' HexDigits '}'
  35. HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
  36. IdentifierStart ::= UnicodeLetter | '$' | '_' | '\' UnicodeEscapeSequence
  37. IdentifierPart ::= IdentifierStart | UnicodeCombiningMark | UnicodeDigit | UnicodeConnectorPunctuation | <ZWNJ> | <ZWJ>
  38. UnicodeLetter ::= <unicode Uppercase letter> | <unicode Lowercase letter> | <unicode Titlecase letter> | <unicode Modifier letter> | <unicode Other letter> | <unicode Letter number>
  39. UnicodeCombiningMark = <unicode Non-spacing mark> | <unicode combining spacing mark>
  40. UnicodeDigit ::= <unicode Decimal number>
  41. UnicodeConnectorPunctuation ::= <unicode Connector punctuation>
  42. DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
  43. DecimalDigits ::= DecimalDigit | DecimalDigits DecimalDigit
  44. DecimalDigit ::= one of { '0'..'9' }
  45. NonZeroDigit ::= one of { '1'..'9' }
  46. ------------------
  47. Annex B Deviations
  48. ------------------
  49. 1. The assertions (?= ) and (?! ) are treated as though they have a surrounding non-capture group, and hence can be quantified.
  50. Other assertions are not quantifiable.
  51. QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  52. Assertion [replaced] ::= '^' | '$' | '\' 'b' | '\' 'B' | QuantifiableAssertion
  53. Term ::= ... | QuantifiableAssertion Quantifier [added]
  54. --------------------------------------------------------------------------------------------------------------------------------
  55. Left factored
  56. --------------------------------------------------------------------------------------------------------------------------------
  57. Pattern ::= Disjunction
  58. Disjunction ::= Alternative ('|' Alternative)*
  59. FOLLOW(Disjunction) = { <eof>, ')' }
  60. Alternative ::= Term*
  61. FOLLOW(Alternative) = { <eof>, ')', '|' }
  62. Term ::= ( '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')' | Atom Quantifier? )
  63. | ( PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')' )
  64. ( '*' | '+' | '?' | '{' DecimalDigits (',' DecimalDigits?)? '}' ) '?'?
  65. PatternCharacter ::= <unicode character> but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
  66. AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
  67. CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
  68. ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
  69. ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
  70. IdentityEscape ::= <unicode character> but not <unicode Uppercase letter>, <unicode Lowercase letter>, <unicode Titlecase letter>, <unicode Modifier letter>, <unicode Other letter>, <unicode Letter number>, '$', '_', <unicode Non-spacing mark>, <unicode combining spacing mark>, <unicode Decimal number>, <unicode Connector punctuation>
  71. DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
  72. CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
  73. CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
  74. ClassRanges ::= [empty] | NonemptyClassRanges
  75. NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
  76. NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
  77. ClassAtom ::= '-' | ClassAtomNoDash
  78. ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
  79. ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
  80. HexEscapeSequence ::= 'x' HexDigit{2}
  81. UnicodeEscapeSequence ::= 'u' HexDigit{4} | 'u' '{' HexDigits{4-6} '}'
  82. HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
  83. DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
  84. DecimalDigits ::= DecimalDigit+
  85. DecimalDigit ::= one of { '0'..'9' }
  86. NonZeroDigit ::= one of { '1'..'9' }
  87. ------------------
  88. Annex B Deviations
  89. ------------------
  90. QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
  91. Term ::= ... | '(' '?' '=' Disjunction ')' [removed] | '(' '?' '!' Disjunction ')' [removed] | QuantifiableAssertion Quantifier? [added]
  92. */
  93. #include "ParserPch.h"
  94. namespace UnifiedRegex
  95. {
  96. template <typename P, const bool IsLiteral>
  97. const CharCount Parser<P, IsLiteral>::initLitbufSize;
  98. ParseError::ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error)
  99. : isBody(isBody), pos(pos), encodedPos(encodedPos), error(error)
  100. {
  101. }
  102. template <typename P, const bool IsLiteral>
  103. Parser<P, IsLiteral>::Parser
  104. ( Js::ScriptContext* scriptContext
  105. , ArenaAllocator* ctAllocator
  106. , StandardChars<EncodedChar>* standardEncodedChars
  107. , StandardChars<Char>* standardChars
  108. , bool isUtf8
  109. #if ENABLE_REGEX_CONFIG_OPTIONS
  110. , DebugWriter* w
  111. #endif
  112. )
  113. : scriptContext(scriptContext)
  114. , ctAllocator(ctAllocator)
  115. , standardEncodedChars(standardEncodedChars)
  116. , standardChars(standardChars)
  117. #if ENABLE_REGEX_CONFIG_OPTIONS
  118. , w(w)
  119. #endif
  120. , input(0)
  121. , inputLim(0)
  122. , next(0)
  123. , inBody(false)
  124. , numGroups(1)
  125. , nextGroupId(1) // implicit overall group always takes index 0
  126. , litbuf(0)
  127. , litbufLen(0)
  128. , litbufNext(0)
  129. , surrogatePairList(nullptr)
  130. , currentSurrogatePairNode(nullptr)
  131. , tempLocationOfSurrogatePair(nullptr)
  132. , tempLocationOfRange(nullptr)
  133. , codePointAtTempLocation(0)
  134. , unicodeFlagPresent(false)
  135. , caseInsensitiveFlagPresent(false)
  136. , positionAfterLastSurrogate(nullptr)
  137. , valueOfLastSurrogate(INVALID_CODEPOINT)
  138. , deferredIfNotUnicodeError(nullptr)
  139. , deferredIfUnicodeError(nullptr)
  140. {
  141. this->SetIsUtf8(isUtf8);
  142. }
  143. //
  144. // Input buffer management
  145. //
  146. template <typename P, const bool IsLiteral>
  147. void Parser<P, IsLiteral>::SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody)
  148. {
  149. this->input = input;
  150. this->inputLim = inputLim;
  151. next = input;
  152. this->inBody = inBody;
  153. this->RestoreMultiUnits(0);
  154. }
  155. template <typename P, const bool IsLiteral>
  156. inline CharCount Parser<P, IsLiteral>::Pos()
  157. {
  158. CharCount nextOffset = Chars<EncodedChar>::OSB(next, input);
  159. Assert(nextOffset >= this->m_cMultiUnits);
  160. return nextOffset - (CharCount) this->m_cMultiUnits;
  161. }
  162. template <typename P, const bool IsLiteral>
  163. inline bool Parser<P, IsLiteral>::IsEOF()
  164. {
  165. return next >= inputLim;
  166. }
  167. template <typename P, const bool IsLiteral>
  168. inline bool Parser<P, IsLiteral>::ECCanConsume(CharCount n /*= 1*/)
  169. {
  170. return next + n <= inputLim;
  171. }
  172. template <typename P, const bool IsLiteral>
  173. inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookahead(CharCount n /*= 0*/)
  174. {
  175. // Ok to look ahead to terminating 0
  176. Assert(next + n <= inputLim);
  177. return next[n];
  178. }
  179. template <typename P, const bool IsLiteral>
  180. inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookback(CharCount n /*= 0*/)
  181. {
  182. // Ok to look ahead to terminating 0
  183. Assert(n + input <= next);
  184. return *(next - n);
  185. }
  186. template <typename P, const bool IsLiteral>
  187. inline void Parser<P, IsLiteral>::ECConsume(CharCount n /*= 1*/)
  188. {
  189. Assert(next + n <= inputLim);
  190. #if DBG
  191. for (CharCount i = 0; i < n; i++)
  192. Assert(!this->IsMultiUnitChar(next[i]));
  193. #endif
  194. next += n;
  195. }
  196. template <typename P, const bool IsLiteral>
  197. inline void Parser<P, IsLiteral>::ECConsumeMultiUnit(CharCount n /*= 1*/)
  198. {
  199. Assert(next + n <= inputLim);
  200. next += n;
  201. }
  202. template <typename P, const bool IsLiteral>
  203. inline void Parser<P, IsLiteral>::ECRevert(CharCount n /*= 1*/)
  204. {
  205. Assert(n + input <= next);
  206. next -= n;
  207. }
  208. //
  209. // Helpers
  210. //
  211. template <typename P, const bool IsLiteral>
  212. int Parser<P, IsLiteral>::TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair /* = false */)
  213. {
  214. if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  215. {
  216. return 0;
  217. }
  218. if (!ECCanConsume(2) || ECLookahead(0) !='{' || !standardEncodedChars->IsHex(ECLookahead(1)))
  219. {
  220. return 0;
  221. }
  222. // The first character is mandatory to consume escape sequence, so we check for it above, at this stage we can set it as we already checked.
  223. codepoint_t codePoint = standardEncodedChars->DigitValue(ECLookahead(1));
  224. int i = 2;
  225. while(ECCanConsume(i + 1) && standardEncodedChars->IsHex(ECLookahead(i)))
  226. {
  227. codePoint <<= 4;
  228. codePoint += standardEncodedChars->DigitValue(ECLookahead(i));
  229. if (codePoint > 0x10FFFF)
  230. {
  231. DeferredFailIfUnicode(JSERR_RegExpInvalidEscape);
  232. return 0;
  233. }
  234. i++;
  235. }
  236. if(!ECCanConsume(i + 1) || ECLookahead(i) != '}')
  237. {
  238. return 0;
  239. }
  240. uint consumptionNumber = i + 1;
  241. Assert(consumptionNumber >= 3);
  242. if (!previousSurrogatePart && trackSurrogatePair && this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  243. {
  244. // Current location
  245. TrackIfSurrogatePair(codePoint, (next - 1), consumptionNumber + 1);
  246. }
  247. char16 other;
  248. // Generally if this code point is a single character, then we take it and return.
  249. // If the character is made up of two characters then we emit the first and backtrack to the start of th escape sequence;
  250. // Following that we check if we have already seen the first character, and if so emit the second and consume the entire escape sequence.
  251. if (codePoint < 0x10000)
  252. {
  253. c = UTC(codePoint);
  254. ECConsumeMultiUnit(consumptionNumber);
  255. }
  256. else if (previousSurrogatePart)
  257. {
  258. previousSurrogatePart = false;
  259. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &other, &c);
  260. ECConsumeMultiUnit(consumptionNumber);
  261. }
  262. else
  263. {
  264. previousSurrogatePart = true;
  265. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &c, &other);
  266. Assert(ECLookback(1) == 'u' && ECLookback(2) == '\\');
  267. ECRevert(2);
  268. }
  269. return consumptionNumber;
  270. }
  271. // This function has the following 'knowledge':
  272. // - A codepoint that might be part of a surrogate pair or part of one
  273. // - A location where that codepoint is located
  274. // - Previously tracked part of surrogate pair (tempLocationOfSurrogatePair, and codePointAtTempLocation), which is always a surrogate lower part.
  275. // - A pointer to the current location of the linked list
  276. // - If a previous location is tracked, then it is of a parsed character (not given character) before current, and not same to current
  277. // This can't be asserted directly, and has to be followed by callers. Term pass can reset with each iteration, as well as this method in cases it needs.
  278. template <typename P, const bool IsLiteral>
  279. void Parser<P, IsLiteral>:: TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength)
  280. {
  281. Assert(codePoint < 0x110000);
  282. Assert(location != nullptr);
  283. Assert(location != this->tempLocationOfSurrogatePair);
  284. if (Js::NumberUtilities::IsSurrogateLowerPart(codePoint))
  285. {
  286. this->tempLocationOfSurrogatePair = location;
  287. this->codePointAtTempLocation = codePoint;
  288. }
  289. else
  290. {
  291. if(Js::NumberUtilities::IsSurrogateUpperPart(codePoint) && this->tempLocationOfSurrogatePair != nullptr)
  292. {
  293. Assert(Js::NumberUtilities::IsSurrogateLowerPart(codePointAtTempLocation));
  294. consumptionLength = (uint32)(location - this->tempLocationOfSurrogatePair) + consumptionLength;
  295. codePoint = Js::NumberUtilities::SurrogatePairAsCodePoint(codePointAtTempLocation, codePoint);
  296. location = this->tempLocationOfSurrogatePair;
  297. }
  298. // At this point we can clear previous location, and then if codePoint is bigger than 0xFFFF store it, as we either received it or combined it above
  299. this->tempLocationOfSurrogatePair = nullptr;
  300. this->codePointAtTempLocation = 0;
  301. }
  302. if (codePoint > 0xFFFF)
  303. {
  304. this->positionAfterLastSurrogate = location + consumptionLength;
  305. this->valueOfLastSurrogate = codePoint;
  306. // When parsing without AST we aren't given an allocator. In addition, only the 2 lines above are used during Pass 0;
  307. // while the bottom is used during Pass 1 (which isn't done when ParseNoAST)
  308. if(this->ctAllocator != nullptr)
  309. {
  310. SurrogatePairTracker* node = Anew(this->ctAllocator, SurrogatePairTracker, location, this->tempLocationOfRange, codePoint, consumptionLength, this->m_cMultiUnits);
  311. if (surrogatePairList == nullptr)
  312. {
  313. Assert(currentSurrogatePairNode == nullptr);
  314. surrogatePairList = node;
  315. currentSurrogatePairNode = node;
  316. }
  317. else
  318. {
  319. Assert(currentSurrogatePairNode != nullptr);
  320. currentSurrogatePairNode->next = node;
  321. currentSurrogatePairNode = node;
  322. }
  323. }
  324. }
  325. }
  326. template <typename P, const bool IsLiteral>
  327. Node* Parser<P, IsLiteral>::CreateSurrogatePairAtom(char16 lower, char16 upper)
  328. {
  329. MatchLiteralNode * literalNode = Anew(this->ctAllocator, MatchLiteralNode, 0, 0);
  330. MatchCharNode lowerNode(lower);
  331. MatchCharNode upperNode(upper);
  332. AccumLiteral(literalNode, &lowerNode);
  333. AccumLiteral(literalNode, &upperNode);
  334. return literalNode;
  335. }
  336. // This function will create appropriate pairs of ranges and add them to the disjunction node.
  337. // Terms used in comments:
  338. // - A minor codePoint is smaller than major codePoint, and both define a range of codePoints above 0x10000; to avoid confusion between lower/upper denoting codeUnits composing the surrogate pair.
  339. // - A boundary is a mod 0x400 alignment marker due to the nature of surrogate pairs representation. So the codepoint 0x10300 lies between boundaries 0x10000 and 0x10400.
  340. // - A prefix is the range set used to represent the values from minorCodePoint to the first boundary above minorCodePoint if applicable.
  341. // - A suffix is the range set used to represent the values from first boundary below majorCodePoint to the majorCodePoint if applicable.
  342. // - A full range is the range set used to represent the values from first boundary above minorCodePoint to first boundary below majorCodePoint if applicable.
  343. // The algorithm works as follows:
  344. // 1. Determine minorBoundary (minorCodePoint - mod 0x400 +0x400) and majorBoundary (majorCodePoint - mod 0x400). minorBoundary > minorCodePoint and majorBoundary < majorCodePoint
  345. // 2. Based on the codePoints and the boundaries, prefix, suffix, and full range is determined. Here are the rules:
  346. // 2-a. If minorBoundary > majorBoundary, we have an inner boundary range, output just that.
  347. // 2-b. If minorBoundary - 0x400u != minorCodepoint (i.e. codePoint doesn't lie right on a boundary to be part of a full range) we have a prefix.
  348. // 2-c. If majorBoundary + 0x3FFu != majorCodepoint (i.e. codePoint doesn't lie right before a boundary to be part of a full range) we have a suffix.
  349. // 2-d. We have a full range, if the two boundaries don't equal, OR the codePoints lie on the range boundaries opposite to what constitutes a prefix/suffix.
  350. // Visual representation for sample range 0x10300 - 0x10900
  351. // | [ _ | ^ ] |
  352. // 0x10000 0x10800 0x11000
  353. // [ ] - denote the actual range
  354. // _ - minorBoundary
  355. // ^ - majorBoundary
  356. // | - other boundaries
  357. // prefix is between [ and _
  358. // suffix is between ^ and ]
  359. // full range is between _ and ^
  360. template <typename P, const bool IsLiteral>
  361. AltNode* Parser<P, IsLiteral>::AppendSurrogateRangeToDisjunction(codepoint_t minorCodePoint, codepoint_t majorCodePoint, AltNode *lastAltNode)
  362. {
  363. Assert(minorCodePoint < majorCodePoint);
  364. Assert(minorCodePoint >= 0x10000u);
  365. Assert(majorCodePoint >= 0x10000u);
  366. Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && unicodeFlagPresent);
  367. char16 lowerMinorCodeUnit, upperMinorCodeUnit, lowerMajorCodeUnit, upperMajorCodeUnit;
  368. Js::NumberUtilities::CodePointAsSurrogatePair(minorCodePoint, &lowerMinorCodeUnit, &upperMinorCodeUnit);
  369. Js::NumberUtilities::CodePointAsSurrogatePair(majorCodePoint, &lowerMajorCodeUnit, &upperMajorCodeUnit);
  370. // These boundaries represent whole range boundaries, as in 0x10000, 0x10400, 0x10800 etc
  371. // minor boundary is the first boundary strictly above minorCodePoint
  372. // major boundary is the first boundary below or equal to majorCodePoint
  373. codepoint_t minorBoundary = minorCodePoint - (minorCodePoint % 0x400u) + 0x400u;
  374. codepoint_t majorBoundary = majorCodePoint - (majorCodePoint % 0x400u);
  375. Assert(minorBoundary >= 0x10000);
  376. Assert(majorBoundary >= 0x10000);
  377. AltNode* tailToAdd = nullptr;
  378. // If the minor boundary is higher than major boundary, that means we have a range within the boundary and is less than 0x400
  379. // Ex: 0x10430 - 0x10700 will have minor boundary of 0x10800 and major of 0x10400
  380. // This pair will be represented in single range set.
  381. const bool singleRange = minorBoundary > majorBoundary;
  382. if (singleRange)
  383. {
  384. Assert(majorCodePoint - minorCodePoint < 0x400u);
  385. Assert(lowerMinorCodeUnit == lowerMajorCodeUnit);
  386. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, lowerMinorCodeUnit);
  387. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  388. setNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)upperMajorCodeUnit);
  389. ConcatNode* concatNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, setNode, nullptr));
  390. tailToAdd = Anew(ctAllocator, AltNode, concatNode, nullptr);
  391. }
  392. else
  393. {
  394. Node* prefixNode = nullptr, *suffixNode = nullptr;
  395. const bool twoConsecutiveRanges = minorBoundary == majorBoundary;
  396. // For minorBoundary,
  397. if (minorBoundary - minorCodePoint == 1) // Single character in minor range
  398. {
  399. // The prefix is only a surrogate pair atom
  400. prefixNode = CreateSurrogatePairAtom(lowerMinorCodeUnit, upperMinorCodeUnit);
  401. }
  402. else if (minorCodePoint != minorBoundary - 0x400u) // Minor range isn't full
  403. {
  404. Assert(minorBoundary - minorCodePoint < 0x400u);
  405. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorCodeUnit);
  406. MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false);
  407. upperSetNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)0xDFFFu);
  408. prefixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
  409. }
  410. else // Full minor range
  411. {
  412. minorBoundary -= 0x400u;
  413. }
  414. if (majorBoundary == majorCodePoint) // Single character in major range
  415. {
  416. // The suffix is only a surrogate pair atom
  417. suffixNode = CreateSurrogatePairAtom(lowerMajorCodeUnit, upperMajorCodeUnit);
  418. majorBoundary -= 0x400u;
  419. }
  420. else if (majorBoundary + 0x3FFu != majorCodePoint) // Major range isn't full
  421. {
  422. Assert(majorCodePoint - majorBoundary < 0x3FFu);
  423. MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMajorCodeUnit);
  424. MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  425. upperSetNode->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)upperMajorCodeUnit);
  426. suffixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
  427. majorBoundary -= 0x400u;
  428. }
  429. const bool nonFullConsecutiveRanges = twoConsecutiveRanges && prefixNode != nullptr && suffixNode != nullptr;
  430. if (nonFullConsecutiveRanges)
  431. {
  432. Assert(suffixNode != nullptr);
  433. Assert(minorCodePoint != minorBoundary - 0x400u);
  434. Assert(majorBoundary + 0x3FFu != majorCodePoint);
  435. // If the minor boundary is equal to major boundary, that means we have a cross boundary range that only needs 2 nodes for prefix/suffix.
  436. // We can only cross one boundary.
  437. Assert(majorCodePoint - minorCodePoint < 0x800u);
  438. tailToAdd = Anew(ctAllocator, AltNode, prefixNode, Anew(ctAllocator, AltNode, suffixNode, nullptr));
  439. }
  440. else
  441. {
  442. // We have 3 sets of ranges, comprising of prefix, full and suffix.
  443. Assert(majorCodePoint - minorCodePoint >= 0x400u);
  444. Assert((prefixNode != nullptr && suffixNode != nullptr) // Spanning more than two ranges
  445. || (prefixNode == nullptr && minorBoundary == minorCodePoint) // Two consecutive ranges and the minor is full
  446. || (suffixNode == nullptr && majorBoundary + 0x3FFu == majorCodePoint)); // Two consecutive ranges and the major is full
  447. Node* lowerOfFullRange;
  448. char16 lowerMinorBoundary, lowerMajorBoundary, ignore;
  449. Js::NumberUtilities::CodePointAsSurrogatePair(minorBoundary, &lowerMinorBoundary, &ignore);
  450. bool singleFullRange = majorBoundary == minorBoundary;
  451. if (singleFullRange)
  452. {
  453. // The lower part of the full range is simple a surrogate lower char
  454. lowerOfFullRange = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorBoundary);
  455. }
  456. else
  457. {
  458. Js::NumberUtilities::CodePointAsSurrogatePair(majorBoundary, &lowerMajorBoundary, &ignore);
  459. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  460. setNode->set.SetRange(ctAllocator, (Char)lowerMinorBoundary, (Char)lowerMajorBoundary);
  461. lowerOfFullRange = setNode;
  462. }
  463. MatchSetNode* fullUpperRange = Anew(ctAllocator, MatchSetNode, false, false);
  464. fullUpperRange->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)0xDFFFu);
  465. // These are added in the following order [full] [prefix][suffix]
  466. // This is doing by prepending, so in reverse.
  467. if (suffixNode != nullptr)
  468. {
  469. tailToAdd = Anew(ctAllocator, AltNode, suffixNode, tailToAdd);
  470. }
  471. if (prefixNode != nullptr)
  472. {
  473. tailToAdd = Anew(ctAllocator, AltNode, prefixNode, tailToAdd);
  474. }
  475. tailToAdd = Anew(ctAllocator, AltNode, Anew(ctAllocator, ConcatNode, lowerOfFullRange, Anew(ctAllocator, ConcatNode, fullUpperRange, nullptr)), tailToAdd);
  476. }
  477. }
  478. if (lastAltNode != nullptr)
  479. {
  480. Assert(lastAltNode->tail == nullptr);
  481. lastAltNode->tail = tailToAdd;
  482. }
  483. return tailToAdd;
  484. }
  485. template <typename P, const bool IsLiteral>
  486. AltNode* Parser<P, IsLiteral>::AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAltNode)
  487. {
  488. char16 lower, upper;
  489. Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &upper);
  490. AltNode* tailNode = Anew(ctAllocator, AltNode, CreateSurrogatePairAtom(lower, upper), nullptr);
  491. if (lastAltNode != nullptr)
  492. {
  493. lastAltNode->tail = tailNode;
  494. }
  495. return tailNode;
  496. }
  497. //
  498. // Errors
  499. //
  500. template <typename P, const bool IsLiteral>
  501. void Parser<P, IsLiteral>::Fail(HRESULT error)
  502. {
  503. throw ParseError(inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  504. }
  505. // This doesn't throw, but stores first error code for throwing later
  506. template <typename P, const bool IsLiteral>
  507. void Parser<P, IsLiteral>::DeferredFailIfUnicode(HRESULT error)
  508. {
  509. Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  510. if (this->deferredIfUnicodeError == nullptr)
  511. {
  512. this->deferredIfUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  513. }
  514. }
  515. // This doesn't throw, but stores first error code for throwing later
  516. template <typename P, const bool IsLiteral>
  517. void Parser<P, IsLiteral>::DeferredFailIfNotUnicode(HRESULT error)
  518. {
  519. Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  520. if (this->deferredIfNotUnicodeError == nullptr)
  521. {
  522. this->deferredIfNotUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
  523. }
  524. }
  525. template <typename P, const bool IsLiteral>
  526. inline void Parser<P, IsLiteral>::ECMust(EncodedChar ec, HRESULT error)
  527. {
  528. // We never look for 0
  529. Assert(ec != 0);
  530. if (ECLookahead() != ec)
  531. Fail(error);
  532. ECConsume();
  533. }
  534. template <typename P, const bool IsLiteral>
  535. inline char16 Parser<P, IsLiteral>::NextChar()
  536. {
  537. Assert(!IsEOF());
  538. // Could be an embedded 0
  539. Char c = this->template ReadFull<true>(next, inputLim);
  540. // No embedded newlines in literals
  541. if (IsLiteral && standardChars->IsNewline(c))
  542. Fail(ERRnoSlash);
  543. return c;
  544. }
  545. //
  546. // Patterns/Disjunctions/Alternatives
  547. //
  548. template <typename P, const bool IsLiteral>
  549. void Parser<P, IsLiteral>::PatternPass0()
  550. {
  551. this->positionAfterLastSurrogate = nullptr;
  552. this->deferredIfNotUnicodeError = nullptr;
  553. this->deferredIfUnicodeError = nullptr;
  554. DisjunctionPass0(0);
  555. }
  556. template <typename P, const bool IsLiteral>
  557. Node* Parser<P, IsLiteral>::PatternPass1()
  558. {
  559. return DisjunctionPass1();
  560. }
  561. template <typename P, const bool IsLiteral>
  562. Node* Parser<P, IsLiteral>::UnionNodes(Node* prev, Node* curr)
  563. {
  564. if (prev->tag == Node::MatchChar)
  565. {
  566. MatchCharNode* prevChar = (MatchCharNode*)prev;
  567. if (curr->tag == Node::MatchChar)
  568. {
  569. MatchCharNode* currChar = (MatchCharNode*)curr;
  570. if (prevChar->cs[0] == currChar->cs[0])
  571. // Just ignore current node
  572. return prevChar;
  573. else
  574. {
  575. // Union chars into new set
  576. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
  577. setNode->set.Set(ctAllocator, prevChar->cs[0]);
  578. setNode->set.Set(ctAllocator, currChar->cs[0]);
  579. return setNode;
  580. }
  581. }
  582. else if (curr->tag == Node::MatchSet)
  583. {
  584. MatchSetNode* currSet = (MatchSetNode*)curr;
  585. if (currSet->isNegation)
  586. // Can't merge
  587. return 0;
  588. else
  589. {
  590. // Union chars into new set
  591. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
  592. setNode->set.Set(ctAllocator, prevChar->cs[0]) ;
  593. setNode->set.UnionInPlace(ctAllocator, currSet->set);
  594. return setNode;
  595. }
  596. }
  597. else
  598. // Can't merge
  599. return 0;
  600. }
  601. else if (prev->tag == Node::MatchSet)
  602. {
  603. MatchSetNode* prevSet = (MatchSetNode*)prev;
  604. if (prevSet->isNegation)
  605. // Can't merge
  606. return 0;
  607. else if (curr->tag == Node::MatchChar)
  608. {
  609. MatchCharNode* currChar = (MatchCharNode*)curr;
  610. // Include char in prev set
  611. prevSet->set.Set(ctAllocator, currChar->cs[0]);
  612. return prevSet;
  613. }
  614. else if (curr->tag == Node::MatchSet)
  615. {
  616. MatchSetNode* currSet = (MatchSetNode*)curr;
  617. if (currSet->isNegation)
  618. // Can't merge
  619. return 0;
  620. else
  621. {
  622. // Include chars in prev set
  623. prevSet->set.UnionInPlace(ctAllocator, currSet->set);
  624. return prevSet;
  625. }
  626. }
  627. else
  628. // Can't merge
  629. return 0;
  630. }
  631. else
  632. // Can't merge
  633. return 0;
  634. }
  635. template <typename P, const bool IsLiteral>
  636. void Parser<P, IsLiteral>::DisjunctionPass0(int depth)
  637. {
  638. AlternativePass0(depth);
  639. while (true)
  640. {
  641. // Could be terminating 0
  642. if (ECLookahead() != '|')
  643. return;
  644. ECConsume();
  645. AlternativePass0(depth);
  646. }
  647. }
  648. template <typename P, const bool IsLiteral>
  649. Node* Parser<P, IsLiteral>::DisjunctionPass1()
  650. {
  651. // Maintain the invariants:
  652. // - alt lists have two or more items
  653. // - alt list items are never alt lists (so we must inline them)
  654. // - an alt list never contains two consecutive match-character/match-set nodes
  655. // (so we must union consecutive items into a single set)
  656. Node* node = AlternativePass1();
  657. AltNode* last = 0;
  658. // First node may be an alternative
  659. if (node->tag == Node::Alt)
  660. {
  661. last = (AltNode*)node;
  662. while (last->tail != 0)
  663. last = last->tail;
  664. }
  665. while (true)
  666. {
  667. // Could be terminating 0
  668. if (ECLookahead() != '|')
  669. return node;
  670. ECConsume(); // '|'
  671. Node* next = AlternativePass1();
  672. AnalysisAssert(next != nullptr);
  673. Node* revisedPrev = UnionNodes(last == 0 ? node : last->head, next);
  674. if (revisedPrev != 0)
  675. {
  676. // Can merge next into previously seen alternative
  677. if (last == 0)
  678. node = revisedPrev;
  679. else
  680. last->head = revisedPrev;
  681. }
  682. else if (next->tag == Node::Alt)
  683. {
  684. AltNode* nextList = (AltNode*)next;
  685. // Since we just had to dereference next to get here, we know that nextList
  686. // can't be nullptr in this case.
  687. AnalysisAssert(nextList != nullptr);
  688. // Append inner list to current list
  689. revisedPrev = UnionNodes(last == 0 ? node : last->head, nextList->head);
  690. if (revisedPrev != 0)
  691. {
  692. // Can merge head of list into previously seen alternative
  693. if (last ==0)
  694. node = revisedPrev;
  695. else
  696. last->head = revisedPrev;
  697. nextList = nextList->tail;
  698. }
  699. if (last == 0)
  700. node = Anew(ctAllocator, AltNode, node, nextList);
  701. else
  702. last->tail = nextList;
  703. AnalysisAssert(nextList != nullptr);
  704. while (nextList->tail != 0)
  705. nextList = nextList->tail;
  706. last = nextList;
  707. }
  708. else
  709. {
  710. // Append node
  711. AltNode* cons = Anew(ctAllocator, AltNode, next, 0);
  712. if (last == 0)
  713. node = Anew(ctAllocator, AltNode, node, cons);
  714. else
  715. last->tail = cons;
  716. last = cons;
  717. }
  718. }
  719. }
  720. template <typename P, const bool IsLiteral>
  721. inline bool Parser<P, IsLiteral>::IsEndOfAlternative()
  722. {
  723. EncodedChar ec = ECLookahead();
  724. // Could be terminating 0, but embedded 0 is part of alternative
  725. return (ec == 0 && IsEOF()) || ec == ')' || ec == '|' || (IsLiteral && ec == '/');
  726. }
  727. template <typename P, const bool IsLiteral>
  728. void Parser<P, IsLiteral>::EnsureLitbuf(CharCount size)
  729. {
  730. if (litbufLen - litbufNext < size)
  731. {
  732. CharCount newLen = max(litbufLen, initLitbufSize);
  733. while (newLen < litbufNext + size)
  734. newLen *= 2;
  735. litbuf = (Char*)ctAllocator->Realloc(litbuf, litbufLen * sizeof(Char), newLen * sizeof(Char));
  736. litbufLen = newLen;
  737. }
  738. }
  739. template <typename P, const bool IsLiteral>
  740. void Parser<P, IsLiteral>::AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode)
  741. {
  742. Assert(charOrLiteralNode->tag == Node::MatchChar || charOrLiteralNode->tag == Node::MatchLiteral);
  743. CharCount addLen = charOrLiteralNode->LiteralLength();
  744. Assert(addLen > 0);
  745. if (deferredLiteralNode->length == 0)
  746. {
  747. // Start a new literal
  748. EnsureLitbuf(addLen);
  749. deferredLiteralNode->offset = litbufNext;
  750. deferredLiteralNode->length = addLen;
  751. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  752. }
  753. else if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
  754. {
  755. // Keep growing the current literal
  756. EnsureLitbuf(addLen);
  757. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  758. deferredLiteralNode->length += addLen;
  759. }
  760. else if (charOrLiteralNode->tag == Node::MatchLiteral && deferredLiteralNode->offset + deferredLiteralNode->length == ((MatchLiteralNode*)charOrLiteralNode)->offset)
  761. {
  762. // Absorb next literal into current literal since they are adjacent
  763. deferredLiteralNode->length += addLen;
  764. }
  765. else
  766. {
  767. // Abandon current literal and start a fresh one (leaves gap)
  768. EnsureLitbuf(deferredLiteralNode->length + addLen);
  769. js_wmemcpy_s(litbuf + litbufNext, litbufLen - litbufNext, litbuf + deferredLiteralNode->offset, deferredLiteralNode->length);
  770. deferredLiteralNode->offset = litbufNext;
  771. litbufNext += deferredLiteralNode->length;
  772. charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
  773. deferredLiteralNode->length += addLen;
  774. }
  775. }
  776. template <typename P, const bool IsLiteral>
  777. Node* Parser<P, IsLiteral>::FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode)
  778. {
  779. if (node == deferredLiteralNode)
  780. {
  781. #if DBG
  782. if (deferredLiteralNode->length == 0)
  783. Assert(false);
  784. #endif
  785. Assert(deferredLiteralNode->offset < litbufNext);
  786. Assert(deferredLiteralNode->offset + deferredLiteralNode->length <= litbufNext);
  787. if (deferredLiteralNode->length == 1)
  788. {
  789. node = Anew(ctAllocator, MatchCharNode, litbuf[deferredLiteralNode->offset]);
  790. if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
  791. // Reclaim last added character
  792. litbufNext--;
  793. // else: leave a gap in the literal buffer
  794. }
  795. else
  796. node = Anew(ctAllocator, MatchLiteralNode, *deferredLiteralNode);
  797. deferredLiteralNode->offset = 0;
  798. deferredLiteralNode->length = 0;
  799. }
  800. return node;
  801. }
  802. template <typename P, const bool IsLiteral>
  803. void Parser<P, IsLiteral>::AlternativePass0(int depth)
  804. {
  805. while (!IsEndOfAlternative())
  806. TermPass0(depth);
  807. }
  808. template <typename P, const bool IsLiteral>
  809. Node* Parser<P, IsLiteral>::AlternativePass1()
  810. {
  811. if (IsEndOfAlternative())
  812. return Anew(ctAllocator, SimpleNode, Node::Empty);
  813. MatchCharNode deferredCharNode(0);
  814. MatchLiteralNode deferredLiteralNode(0, 0);
  815. // Maintain the invariants:
  816. // - concat lists have two or more items
  817. // - concat list items are never concat lists
  818. // - a concat list never contains two consecutive match-character/match-literal nodes
  819. bool previousSurrogatePart = false;
  820. Node* node = TermPass1(&deferredCharNode, previousSurrogatePart);
  821. AnalysisAssert(node != nullptr);
  822. ConcatNode* last = 0;
  823. // First node may be a concat
  824. if (node->tag == Node::Concat)
  825. {
  826. last = (ConcatNode*)node;
  827. while (last->tail != 0)
  828. last = last->tail;
  829. }
  830. if (last == 0)
  831. {
  832. if (node->LiteralLength() > 0)
  833. {
  834. // Begin a new literal
  835. AccumLiteral(&deferredLiteralNode, node);
  836. node = &deferredLiteralNode;
  837. }
  838. }
  839. else
  840. {
  841. if (last->head->LiteralLength() > 0)
  842. {
  843. // Begin a new literal
  844. AccumLiteral(&deferredLiteralNode, last->head);
  845. last->head = &deferredLiteralNode;
  846. }
  847. }
  848. while (!IsEndOfAlternative())
  849. {
  850. Node* next = TermPass1(&deferredCharNode, previousSurrogatePart);
  851. AnalysisAssert(next != nullptr);
  852. if (next->LiteralLength() > 0)
  853. {
  854. // Begin a new literal or grow the existing literal
  855. AccumLiteral(&deferredLiteralNode, next);
  856. if (last == 0)
  857. {
  858. if (node != &deferredLiteralNode)
  859. {
  860. // So far we have first item and the current literal
  861. ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
  862. node = Anew(ctAllocator, ConcatNode, node, cons);
  863. last = cons;
  864. }
  865. // else: keep growing first literal
  866. }
  867. else
  868. {
  869. if (last->head != &deferredLiteralNode)
  870. {
  871. // Append a new literal node
  872. ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
  873. last->tail = cons;
  874. last = cons;
  875. }
  876. // else: keep growing current literal
  877. }
  878. }
  879. else if (next->tag == Node::Concat)
  880. {
  881. // Append this list to accumulated list
  882. ConcatNode* nextList = (ConcatNode*)next;
  883. if (nextList->head->LiteralLength() > 0 &&
  884. ((last == 0 && node == &deferredLiteralNode) ||
  885. (last != 0 && last->head == &deferredLiteralNode)))
  886. {
  887. // Absorb the next character or literal into the current literal
  888. // (may leave a gab in litbuf)
  889. AccumLiteral(&deferredLiteralNode, nextList->head);
  890. nextList = nextList->tail;
  891. // List has at least two items
  892. AnalysisAssert(nextList != 0);
  893. // List should be in canonical form, so no consecutive chars/literals
  894. Assert(nextList->head->LiteralLength() == 0);
  895. }
  896. if (last == 0)
  897. node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), nextList);
  898. else
  899. {
  900. last->head = FinalTerm(last->head, &deferredLiteralNode);
  901. last->tail = nextList;
  902. }
  903. // NextList can't be nullptr, since it was last set as next, which
  904. // was dereferenced, or it was set on a path that already has this
  905. // analysis assert.
  906. AnalysisAssert(nextList != nullptr);
  907. while (nextList->tail != 0)
  908. nextList = nextList->tail;
  909. last = nextList;
  910. // No outstanding literals
  911. Assert(deferredLiteralNode.length == 0);
  912. // We just set this from nextList, which we know is not nullptr.
  913. AnalysisAssert(last != nullptr);
  914. if (last->head->LiteralLength() > 0)
  915. {
  916. // If the list ends with a literal, transfer it into deferredLiteralNode
  917. // so we can continue accumulating (won't leave a gab in litbuf)
  918. AccumLiteral(&deferredLiteralNode, last->head);
  919. // Can discard MatchLiteralNode since it lives in compile-time allocator
  920. last->head = &deferredLiteralNode;
  921. }
  922. }
  923. else
  924. {
  925. // Append this node to accumulated list
  926. ConcatNode* cons = Anew(ctAllocator, ConcatNode, next, 0);
  927. if (last == 0)
  928. node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), cons);
  929. else
  930. {
  931. last->head = FinalTerm(last->head, &deferredLiteralNode);
  932. last->tail = cons;
  933. }
  934. last = cons;
  935. // No outstanding literals
  936. Assert(deferredLiteralNode.length == 0);
  937. }
  938. }
  939. if (last == 0)
  940. node = FinalTerm(node, &deferredLiteralNode);
  941. else
  942. last->head = FinalTerm(last->head, &deferredLiteralNode);
  943. // No outstanding literals
  944. Assert(deferredLiteralNode.length == 0);
  945. return node;
  946. }
  947. //
  948. // Terms
  949. //
  950. template <typename P, const bool IsLiteral>
  951. Node* Parser<P, IsLiteral>::NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body)
  952. {
  953. //
  954. // NOTE: We'd like to represent r? (i.e. r{0,1}) as r|<empty> since the loop representation has high overhead.
  955. // HOWEVER if r contains a group definition and could match empty then we must execute as a loop
  956. // so that group bindings are correctly reset on no progress (e.g.: /(a*)?/.exec("")). Thus we defer
  957. // this optimization until pass 4 of the optimizer, at which point we know whether r could match empty.
  958. //
  959. if (lower == 1 && upper == 1)
  960. return body;
  961. else if (lower == 0 && upper == 0)
  962. // Loop is equivalent to empty. If the loop body contains group definitions they will have already been
  963. // counted towards the overall number of groups. The matcher will initialize their contents to
  964. // undefined, and since the loop body would never execute the inner groups could never be updated from
  965. // undefined.
  966. return Anew(ctAllocator, SimpleNode, Node::Empty);
  967. else
  968. return Anew(ctAllocator, LoopNode, lower, upper, isGreedy, body);
  969. }
  970. template <typename P, const bool IsLiteral>
  971. bool Parser<P, IsLiteral>::AtQuantifier()
  972. {
  973. // Could be terminating 0
  974. switch (ECLookahead())
  975. {
  976. case '*':
  977. case '+':
  978. case '?':
  979. return true;
  980. case '{':
  981. {
  982. CharCount lookahead = 1;
  983. while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
  984. lookahead++;
  985. if (lookahead == 1 || !ECCanConsume(lookahead + 1))
  986. return false;
  987. switch (ECLookahead(lookahead))
  988. {
  989. case ',':
  990. lookahead++;
  991. if (ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}')
  992. return true;
  993. else
  994. {
  995. CharCount saved = lookahead;
  996. while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
  997. lookahead++;
  998. if (lookahead == saved)
  999. return false;
  1000. return ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}';
  1001. }
  1002. case '}':
  1003. return true;
  1004. default:
  1005. return false;
  1006. }
  1007. }
  1008. default:
  1009. return false;
  1010. }
  1011. }
  1012. template <typename P, const bool IsLiteral>
  1013. bool Parser<P, IsLiteral>::OptNonGreedy()
  1014. {
  1015. // Could be terminating 0
  1016. if (ECLookahead() != '?')
  1017. return true;
  1018. ECConsume();
  1019. return false;
  1020. }
  1021. template <typename P, const bool IsLiteral>
  1022. CharCount Parser<P, IsLiteral>::RepeatCount()
  1023. {
  1024. CharCount n = 0;
  1025. int digits = 0;
  1026. while (true)
  1027. {
  1028. // Could be terminating 0
  1029. EncodedChar ec = ECLookahead();
  1030. if (!standardEncodedChars->IsDigit(ec))
  1031. {
  1032. if (digits == 0)
  1033. {
  1034. Fail(JSERR_RegExpSyntax);
  1035. }
  1036. return n;
  1037. }
  1038. if (n > MaxCharCount / 10)
  1039. {
  1040. break;
  1041. }
  1042. n *= 10;
  1043. if (n > MaxCharCount - standardEncodedChars->DigitValue(ec))
  1044. {
  1045. break;
  1046. }
  1047. n += standardEncodedChars->DigitValue(ec);
  1048. digits++;
  1049. ECConsume();
  1050. }
  1051. Assert(digits != 0); // shouldn't be able to reach here with (digits == 0)
  1052. // The token had a value larger than MaxCharCount so we didn't return the value and reached here instead.
  1053. // Consume the rest of the token and return MaxCharCount.
  1054. while (true)
  1055. {
  1056. EncodedChar ec = ECLookahead();
  1057. if (!standardEncodedChars->IsDigit(ec))
  1058. {
  1059. return MaxCharCount;
  1060. }
  1061. ECConsume();
  1062. }
  1063. }
  1064. template <typename P, const bool IsLiteral>
  1065. void Parser<P, IsLiteral>::TermPass0(int depth)
  1066. {
  1067. PROBE_STACK_NO_DISPOSE(scriptContext, Js::Constants::MinStackRegex);
  1068. // Either we have a location at the start, or the end, never both. As in between it should have been cleared if surrogate pair
  1069. // Or must be cleared if we didn't perform the check
  1070. bool clearLocationIfPresent = this->tempLocationOfSurrogatePair != nullptr;
  1071. switch (ECLookahead())
  1072. {
  1073. case '^':
  1074. case '$':
  1075. ECConsume();
  1076. return;
  1077. case '\\':
  1078. ECConsume();
  1079. if (AtomEscapePass0())
  1080. return;
  1081. break;
  1082. case '(':
  1083. // Can't combine into a single codeunit because of group present
  1084. this->tempLocationOfSurrogatePair = nullptr;
  1085. this->codePointAtTempLocation = 0;
  1086. clearLocationIfPresent = false;
  1087. ECConsume();
  1088. switch (ECLookahead())
  1089. {
  1090. case '?':
  1091. if (!ECCanConsume(2))
  1092. Fail(JSERR_RegExpSyntax);
  1093. switch (ECLookahead(1))
  1094. {
  1095. case '=':
  1096. case '!':
  1097. case ':':
  1098. ECConsume(2);
  1099. break;
  1100. default:
  1101. numGroups++;
  1102. break;
  1103. }
  1104. break;
  1105. default:
  1106. numGroups++;
  1107. break;
  1108. }
  1109. DisjunctionPass0(depth + 1);
  1110. ECMust(')', JSERR_RegExpNoParen);
  1111. break;
  1112. case '.':
  1113. ECConsume();
  1114. break;
  1115. case '[':
  1116. ECConsume();
  1117. this->tempLocationOfSurrogatePair = nullptr;
  1118. this->codePointAtTempLocation = 0;
  1119. this->tempLocationOfRange = next;
  1120. CharacterClassPass0();
  1121. this->tempLocationOfRange = nullptr;
  1122. ECMust(']', JSERR_RegExpNoBracket);
  1123. break;
  1124. case ')':
  1125. case '|':
  1126. Fail(JSERR_RegExpSyntax);
  1127. break;
  1128. case ']':
  1129. case '}':
  1130. NextChar();
  1131. break;
  1132. case '*':
  1133. case '+':
  1134. case '?':
  1135. case '{':
  1136. if (AtQuantifier())
  1137. Fail(JSERR_RegExpBadQuant);
  1138. else
  1139. ECConsume();
  1140. break;
  1141. case 0:
  1142. if (IsEOF())
  1143. // Terminating 0
  1144. Fail(JSERR_RegExpSyntax);
  1145. // else fall-through for embedded 0
  1146. default:
  1147. {
  1148. const EncodedChar* current = next;
  1149. Char c = NextChar();
  1150. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1151. {
  1152. TrackIfSurrogatePair(c, current, (uint32)(next - current));
  1153. }
  1154. // Closing '/' in literals should be caught explicitly
  1155. Assert(!IsLiteral || c != '/');
  1156. }
  1157. break;
  1158. }
  1159. if (clearLocationIfPresent && this->tempLocationOfSurrogatePair != nullptr)
  1160. {
  1161. this->tempLocationOfSurrogatePair = nullptr;
  1162. this->codePointAtTempLocation = 0;
  1163. }
  1164. if (AtQuantifier())
  1165. {
  1166. switch (ECLookahead())
  1167. {
  1168. case '*':
  1169. case '+':
  1170. case '?':
  1171. ECConsume();
  1172. OptNonGreedy();
  1173. break;
  1174. case '{':
  1175. {
  1176. ECConsume();
  1177. CharCount lower = RepeatCount();
  1178. switch (ECLookahead())
  1179. {
  1180. case ',':
  1181. ECConsume();
  1182. if (ECLookahead() == '}')
  1183. {
  1184. ECConsume();
  1185. OptNonGreedy();
  1186. }
  1187. else
  1188. {
  1189. CharCount upper = RepeatCount();
  1190. if (upper < lower)
  1191. Fail(JSERR_RegExpSyntax);
  1192. Assert(ECLookahead() == '}');
  1193. ECConsume();
  1194. OptNonGreedy();
  1195. }
  1196. break;
  1197. case '}':
  1198. ECConsume();
  1199. OptNonGreedy();
  1200. break;
  1201. default:
  1202. Assert(false);
  1203. break;
  1204. }
  1205. break;
  1206. }
  1207. default:
  1208. Assert(false);
  1209. break;
  1210. }
  1211. }
  1212. }
  1213. template <typename P, const bool IsLiteral>
  1214. Node* Parser<P, IsLiteral>::TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1215. {
  1216. PROBE_STACK_NO_DISPOSE(scriptContext, Js::Constants::MinStackRegex);
  1217. Node* node = 0;
  1218. bool containsSurrogatePair = false;
  1219. switch (ECLookahead())
  1220. {
  1221. case '^':
  1222. ECConsume();
  1223. return Anew(ctAllocator, SimpleNode, Node::BOL); // No quantifier allowed
  1224. case '$':
  1225. ECConsume();
  1226. return Anew(ctAllocator, SimpleNode, Node::EOL); // No quantifier allowed
  1227. case '\\':
  1228. ECConsume();
  1229. if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
  1230. {
  1231. break; // For quantifier
  1232. }
  1233. else if (AtomEscapePass1(node, deferredCharNode, previousSurrogatePart))
  1234. {
  1235. return node; // No quantifier allowed
  1236. }
  1237. break; // else: fall-through for opt quantifier
  1238. case '(':
  1239. ECConsume();
  1240. switch (ECLookahead())
  1241. {
  1242. case '?':
  1243. switch (ECLookahead(1))
  1244. {
  1245. case '=':
  1246. ECConsume(2); // ?=
  1247. node = DisjunctionPass1();
  1248. Assert(ECLookahead() == ')');
  1249. ECConsume(); // )
  1250. node = Anew(ctAllocator, AssertionNode, false, node);
  1251. break; // As per Annex B, allow this to be quantifiable
  1252. case '!':
  1253. ECConsume(2); // ?!
  1254. node = DisjunctionPass1();
  1255. Assert(ECLookahead() == ')');
  1256. ECConsume(); // )
  1257. node = Anew(ctAllocator, AssertionNode, true, node);
  1258. break; // As per Annex B, allow this to be quantifiable
  1259. case ':':
  1260. ECConsume(2); // ?:
  1261. node = DisjunctionPass1();
  1262. Assert(ECLookahead() == ')');
  1263. ECConsume(); // )
  1264. break; // fall-through for opt quantifier
  1265. default:
  1266. {
  1267. // ? not yet consumed
  1268. int thisGroupId = nextGroupId++;
  1269. node = DisjunctionPass1();
  1270. Assert(ECLookahead() == ')');
  1271. ECConsume(); // )
  1272. node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
  1273. break; // fall-through for opt quantifier
  1274. }
  1275. }
  1276. break;
  1277. default:
  1278. {
  1279. // next char not yet consumed
  1280. int thisGroupId = nextGroupId++;
  1281. node = DisjunctionPass1();
  1282. Assert(ECLookahead() == ')');
  1283. ECConsume(); // )
  1284. node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
  1285. break; // fall-through for opt quantifier
  1286. }
  1287. }
  1288. break;
  1289. case '.':
  1290. {
  1291. ECConsume();
  1292. node = GetNodeWithValidCharacterSet('.');
  1293. break; // fall-through for opt quantifier
  1294. }
  1295. case '[':
  1296. ECConsume();
  1297. if (unicodeFlagPresent)
  1298. {
  1299. containsSurrogatePair = this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->rangeLocation == next;
  1300. }
  1301. node = containsSurrogatePair ? CharacterClassPass1<true>() : CharacterClassPass1<false>();
  1302. Assert(ECLookahead() == ']');
  1303. ECConsume(); // ]
  1304. break; // fall-through for opt quantifier
  1305. #if DBG
  1306. case ')':
  1307. case '|':
  1308. Assert(false);
  1309. break;
  1310. #endif
  1311. case ']':
  1312. // SPEC DEVIATION: This should be syntax error, instead accept as itself
  1313. deferredCharNode->cs[0] = NextChar();
  1314. node = deferredCharNode;
  1315. break; // fall-through for opt quantifier
  1316. #if DBG
  1317. case '*':
  1318. case '+':
  1319. case '?':
  1320. AssertMsg(false, "Allowed only in the escaped form. These should be caught by TermPass0.");
  1321. break;
  1322. #endif
  1323. case 0:
  1324. if (IsEOF())
  1325. // Terminating 0
  1326. Fail(JSERR_RegExpSyntax);
  1327. // else fall-through for embedded 0
  1328. default:
  1329. if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
  1330. {
  1331. break; //For quantifier
  1332. }
  1333. else
  1334. {
  1335. deferredCharNode->cs[0] = NextChar();
  1336. node = deferredCharNode;
  1337. break; // fall-through for opt quantifier
  1338. }
  1339. }
  1340. Assert(node != 0);
  1341. if (AtQuantifier())
  1342. {
  1343. switch (ECLookahead())
  1344. {
  1345. case '*':
  1346. if (node == deferredCharNode)
  1347. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1348. ECConsume();
  1349. return NewLoopNode(0, CharCountFlag, OptNonGreedy(), node);
  1350. case '+':
  1351. if (node == deferredCharNode)
  1352. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1353. ECConsume();
  1354. return NewLoopNode(1, CharCountFlag, OptNonGreedy(), node);
  1355. case '?':
  1356. if (node == deferredCharNode)
  1357. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1358. ECConsume();
  1359. return NewLoopNode(0, 1, OptNonGreedy(), node);
  1360. case '{':
  1361. {
  1362. if (node == deferredCharNode)
  1363. node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
  1364. ECConsume();
  1365. CharCount lower = RepeatCount();
  1366. switch (ECLookahead())
  1367. {
  1368. case ',':
  1369. ECConsume();
  1370. if (ECLookahead() == '}')
  1371. {
  1372. ECConsume();
  1373. return NewLoopNode(lower, CharCountFlag, OptNonGreedy(), node);
  1374. }
  1375. else
  1376. {
  1377. CharCount upper = RepeatCount();
  1378. Assert(lower <= upper);
  1379. Assert(ECLookahead() == '}');
  1380. ECConsume(); // }
  1381. return NewLoopNode(lower, upper, OptNonGreedy(), node);
  1382. }
  1383. case '}':
  1384. ECConsume();
  1385. return NewLoopNode(lower, lower, OptNonGreedy(), node);
  1386. default:
  1387. Assert(false);
  1388. break;
  1389. }
  1390. break;
  1391. }
  1392. default:
  1393. Assert(false);
  1394. break;
  1395. }
  1396. }
  1397. return node;
  1398. }
  1399. #pragma warning(push)
  1400. #pragma warning(disable:4702) // unreachable code
  1401. template <typename P, const bool IsLiteral>
  1402. bool Parser<P, IsLiteral>::AtomEscapePass0()
  1403. {
  1404. EncodedChar ec = ECLookahead();
  1405. if (ec == 0 && IsEOF())
  1406. {
  1407. // Terminating 0
  1408. Fail(JSERR_RegExpSyntax);
  1409. return false;
  1410. }
  1411. else if (standardEncodedChars->IsDigit(ec))
  1412. {
  1413. do
  1414. {
  1415. ECConsume();
  1416. }
  1417. while (standardEncodedChars->IsDigit(ECLookahead())); // terminating 0 is not a digit
  1418. return false;
  1419. }
  1420. else if (ECLookahead() == 'c')
  1421. {
  1422. if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
  1423. ECConsume(2);
  1424. return false;
  1425. }
  1426. else
  1427. {
  1428. const EncodedChar *current = next;
  1429. // An escaped '/' is ok
  1430. Char c = NextChar();
  1431. switch (c)
  1432. {
  1433. case 'b':
  1434. case 'B':
  1435. return true;
  1436. // case 'c': handled as special case above
  1437. case 'x':
  1438. if (ECCanConsume(2) &&
  1439. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1440. standardEncodedChars->IsHex(ECLookahead(1)))
  1441. ECConsume(2);
  1442. break;
  1443. case 'u':
  1444. bool surrogateEncountered = false;
  1445. int lengthOfSurrogate = TryParseExtendedUnicodeEscape(c, surrogateEncountered, true);
  1446. if (lengthOfSurrogate > 0)
  1447. {
  1448. if (surrogateEncountered)
  1449. {
  1450. // If we don't have an allocator, we don't create nodes
  1451. // Asserts in place as extra checks for when we do have an allocator
  1452. Assert(this->ctAllocator == nullptr || this->currentSurrogatePairNode != nullptr);
  1453. Assert(this->ctAllocator == nullptr || current == this->currentSurrogatePairNode->location);
  1454. ECConsume(lengthOfSurrogate);
  1455. }
  1456. //Don't fall through
  1457. break;
  1458. }
  1459. else if (ECCanConsume(4) &&
  1460. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1461. standardEncodedChars->IsHex(ECLookahead(1)) &&
  1462. standardEncodedChars->IsHex(ECLookahead(2)) &&
  1463. standardEncodedChars->IsHex(ECLookahead(3)))
  1464. {
  1465. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1466. {
  1467. codepoint_t value = (standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  1468. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  1469. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  1470. (standardEncodedChars->DigitValue(ECLookahead(3)));
  1471. TrackIfSurrogatePair(value, (next - 1), 5);
  1472. }
  1473. ECConsume(4);
  1474. }
  1475. break;
  1476. }
  1477. // embedded 0 is ok
  1478. return false;
  1479. }
  1480. }
  1481. #pragma warning(pop)
  1482. template <typename P, const bool IsLiteral>
  1483. bool Parser<P, IsLiteral>::AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1484. {
  1485. Assert(!IsEOF()); // checked for terminating 0 in pass 0
  1486. if (standardEncodedChars->IsDigit(ECLookahead()))
  1487. {
  1488. // As per Annex B, allow octal escapes as well as group references, disambiguate based on known
  1489. // number of groups.
  1490. if (ECLookahead() == '0')
  1491. {
  1492. // fall through for octal
  1493. }
  1494. else
  1495. {
  1496. // Could be a group reference, but only if between 1 and 5 digits which resolve to a valid group number
  1497. int n = 0;
  1498. CharCount digits = 0;
  1499. do
  1500. {
  1501. n = n * 10 + (int)standardEncodedChars->DigitValue(ECLookahead(digits));
  1502. digits++;
  1503. }
  1504. while (digits < 5 && ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits)));
  1505. if (n >= numGroups ||
  1506. (ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits))))
  1507. {
  1508. if (standardEncodedChars->IsOctal(ECLookahead()))
  1509. {
  1510. // fall through for octal
  1511. }
  1512. else
  1513. {
  1514. // \8 and \9 are identity escapes
  1515. deferredCharNode->cs[0] = Chars<EncodedChar>::CTW(ECLookahead());
  1516. ECConsume();
  1517. node = deferredCharNode;
  1518. return false; // not an assertion
  1519. }
  1520. }
  1521. else
  1522. {
  1523. ECConsume(digits);
  1524. node = Anew(ctAllocator, MatchGroupNode, n);
  1525. return false; // not an assertion
  1526. }
  1527. }
  1528. // Must be between 1 and 3 octal digits
  1529. Assert(standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
  1530. uint n = 0;
  1531. CharCount digits = 0;
  1532. do
  1533. {
  1534. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  1535. if (m > Chars<uint8>::MaxUChar) // Regex octal codes only support single byte (ASCII) characters.
  1536. break;
  1537. n = m;
  1538. ECConsume();
  1539. digits++;
  1540. }
  1541. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
  1542. deferredCharNode->cs[0] = UTC((UChar)n);
  1543. node = deferredCharNode;
  1544. return false; // not an assertion
  1545. }
  1546. else if (ECLookahead() == 'c')
  1547. {
  1548. Char c;
  1549. if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
  1550. {
  1551. c = UTC(Chars<EncodedChar>::CTU(ECLookahead(1)) % 32);
  1552. ECConsume(2);
  1553. }
  1554. else
  1555. {
  1556. // SPEC DEVIATION: For non-letters or EOF, take the leading '\' to be itself, and
  1557. // don't consume the 'c' or letter.
  1558. c = '\\';
  1559. }
  1560. deferredCharNode->cs[0] = c;
  1561. node = deferredCharNode;
  1562. return false; // not an assertion
  1563. }
  1564. else
  1565. {
  1566. Char c = NextChar();
  1567. switch (c)
  1568. {
  1569. case 'b':
  1570. node = Anew(ctAllocator, WordBoundaryNode, false);
  1571. return true; // Is an assertion
  1572. case 'B':
  1573. node = Anew(ctAllocator, WordBoundaryNode, true);
  1574. return true; // Is an assertion
  1575. case 'f':
  1576. c = '\f';
  1577. break; // fall-through for identity escape
  1578. case 'n':
  1579. c = '\n';
  1580. break; // fall-through for identity escape
  1581. case 'r':
  1582. c = '\r';
  1583. break; // fall-through for identity escape
  1584. case 't':
  1585. c = '\t';
  1586. break; // fall-through for identity escape
  1587. case 'v':
  1588. c = '\v';
  1589. break; // fall-through for identity escape
  1590. case 'd':
  1591. {
  1592. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1593. standardChars->SetDigits(ctAllocator, setNode->set);
  1594. node = setNode;
  1595. return false; // not an assertion
  1596. }
  1597. case 'D':
  1598. {
  1599. node = GetNodeWithValidCharacterSet('D');
  1600. return false; // not an assertion
  1601. }
  1602. case 's':
  1603. {
  1604. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1605. standardChars->SetWhitespace(ctAllocator, setNode->set);
  1606. node = setNode;
  1607. return false; // not an assertion
  1608. }
  1609. case 'S':
  1610. {
  1611. node = GetNodeWithValidCharacterSet('S');
  1612. return false; // not an assertion
  1613. }
  1614. case 'w':
  1615. {
  1616. MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
  1617. if (this->unicodeFlagPresent && this->caseInsensitiveFlagPresent)
  1618. {
  1619. standardChars->SetWordIUChars(ctAllocator, setNode->set);
  1620. }
  1621. else
  1622. {
  1623. standardChars->SetWordChars(ctAllocator, setNode->set);
  1624. }
  1625. node = setNode;
  1626. return false; // not an assertion
  1627. }
  1628. case 'W':
  1629. {
  1630. node = GetNodeWithValidCharacterSet('W');
  1631. return false; // not an assertion
  1632. }
  1633. // case 'c': handled as special case above
  1634. case 'x':
  1635. if (ECCanConsume(2) &&
  1636. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1637. standardEncodedChars->IsHex(ECLookahead(1)))
  1638. {
  1639. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  1640. (standardEncodedChars->DigitValue(ECLookahead(1))));
  1641. ECConsume(2);
  1642. // fall-through for identity escape
  1643. }
  1644. // Take to be identity escape if ill-formed as per Annex B
  1645. break;
  1646. case 'u':
  1647. if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
  1648. break;
  1649. else if (ECCanConsume(4) &&
  1650. standardEncodedChars->IsHex(ECLookahead(0)) &&
  1651. standardEncodedChars->IsHex(ECLookahead(1)) &&
  1652. standardEncodedChars->IsHex(ECLookahead(2)) &&
  1653. standardEncodedChars->IsHex(ECLookahead(3)))
  1654. {
  1655. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  1656. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  1657. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  1658. (standardEncodedChars->DigitValue(ECLookahead(3))));
  1659. ECConsume(4);
  1660. // fall-through for identity escape
  1661. }
  1662. // Take to be identity escape if ill-formed as per Annex B
  1663. break;
  1664. case '^':
  1665. case '$':
  1666. case '\\':
  1667. case '.':
  1668. case '*':
  1669. case '+':
  1670. case '?':
  1671. case '(':
  1672. case ')':
  1673. case '[':
  1674. case ']':
  1675. case '{':
  1676. case '}':
  1677. case '|':
  1678. case '/':
  1679. break; // fall-through for identity escape
  1680. default:
  1681. if (this->unicodeFlagPresent)
  1682. {
  1683. // As per #sec-forbidden-extensions, if unicode flag is present, we must disallow any other escape.
  1684. this->Fail(JSERR_RegExpInvalidEscape); // throw SyntaxError
  1685. }
  1686. // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok
  1687. break;
  1688. }
  1689. // Must be an identity escape
  1690. deferredCharNode->cs[0] = c;
  1691. node = deferredCharNode;
  1692. return false; // not an assertion
  1693. }
  1694. }
  1695. template <typename P, const bool IsLiteral>
  1696. bool Parser<P, IsLiteral>::SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
  1697. {
  1698. if (!this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() || !unicodeFlagPresent)
  1699. {
  1700. return false;
  1701. }
  1702. if (this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
  1703. {
  1704. AssertMsg(!this->currentSurrogatePairNode->IsInsideRange(), "Should not be calling this pass if we are currently inside a range.");
  1705. char16 lower, upper;
  1706. uint tableIndex = 0, actualHigh = 0;
  1707. codepoint_t equivClass[CaseInsensitive::EquivClassSize];
  1708. if (caseInsensitiveFlagPresent && CaseInsensitive::RangeToEquivClass(tableIndex, this->currentSurrogatePairNode->value, this->currentSurrogatePairNode->value, actualHigh, equivClass))
  1709. {
  1710. Node *equivNode[CaseInsensitive::EquivClassSize];
  1711. int indexForNextNode = 0;
  1712. for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
  1713. {
  1714. bool alreadyAdded = false;
  1715. for (int j = 0; j < i; j++)
  1716. {
  1717. if (equivClass[i] == equivClass[j])
  1718. {
  1719. alreadyAdded = true;
  1720. break;
  1721. }
  1722. }
  1723. if (!alreadyAdded)
  1724. {
  1725. if (Js::NumberUtilities::IsInSupplementaryPlane(equivClass[i]))
  1726. {
  1727. Js::NumberUtilities::CodePointAsSurrogatePair(equivClass[i], &lower, &upper);
  1728. equivNode[indexForNextNode] = CreateSurrogatePairAtom(lower, upper);
  1729. }
  1730. else
  1731. {
  1732. equivNode[indexForNextNode] = Anew(ctAllocator, MatchCharNode, (Char)equivClass[i]);
  1733. }
  1734. indexForNextNode ++;
  1735. }
  1736. }
  1737. Assert(indexForNextNode > 0);
  1738. if (indexForNextNode == 1)
  1739. {
  1740. node = equivNode[0];
  1741. }
  1742. else
  1743. {
  1744. AltNode *altNode = Anew(ctAllocator, AltNode, equivNode[0], nullptr);
  1745. AltNode *altNodeTail = altNode;
  1746. for (int i = 1; i < indexForNextNode; i++)
  1747. {
  1748. altNodeTail->tail = Anew(ctAllocator, AltNode, equivNode[i], nullptr);
  1749. altNodeTail = altNodeTail->tail;
  1750. }
  1751. node = altNode;
  1752. }
  1753. }
  1754. else
  1755. {
  1756. Js::NumberUtilities::CodePointAsSurrogatePair(this->currentSurrogatePairNode->value, &lower, &upper);
  1757. node = CreateSurrogatePairAtom(lower, upper);
  1758. }
  1759. previousSurrogatePart = false;
  1760. Assert(ECCanConsume(this->currentSurrogatePairNode->length));
  1761. ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
  1762. this->RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
  1763. this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
  1764. return true;
  1765. }
  1766. return false;
  1767. }
  1768. //
  1769. // Classes
  1770. //
  1771. template <typename P, const bool IsLiteral>
  1772. bool Parser<P, IsLiteral>::AtSecondSingletonClassAtom()
  1773. {
  1774. Assert(ECLookahead() == '-');
  1775. if (ECLookahead(1) == '\\')
  1776. {
  1777. switch (ECLookahead(2))
  1778. {
  1779. case 'd':
  1780. case 'D':
  1781. case 's':
  1782. case 'S':
  1783. case 'w':
  1784. case 'W':
  1785. // These all denote non-singleton sets
  1786. return false;
  1787. default:
  1788. // fall-through for singleton
  1789. break;
  1790. }
  1791. }
  1792. return true;
  1793. }
  1794. template <typename P, const bool IsLiteral>
  1795. void Parser<P, IsLiteral>::CharacterClassPass0()
  1796. {
  1797. // Could be terminating 0
  1798. if (ECLookahead() == '^')
  1799. ECConsume();
  1800. EncodedChar nextChar = ECLookahead();
  1801. const EncodedChar* current;
  1802. codepoint_t lastCodepoint = INVALID_CODEPOINT;
  1803. codepoint_t pendingRangeStart = INVALID_CODEPOINT;
  1804. codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
  1805. bool previousSurrogatePart = false;
  1806. while(nextChar != ']')
  1807. {
  1808. current = next;
  1809. if (nextChar == '\0' && IsEOF())
  1810. {
  1811. // Report as unclosed '['
  1812. Fail(JSERR_RegExpNoBracket);
  1813. return;
  1814. } // Otherwise embedded '\0' is ok
  1815. else if (nextChar == '\\')
  1816. {
  1817. // Consume, as classescapepass0 expects for it to be consumed
  1818. Char outChar = NextChar();
  1819. // If previousSurrogatePart = true upon leaving this method, then we are going to pass through here twice
  1820. // This is because \u{} escape sequence was encountered that is actually 2 characters, the second time we will pass consuming entire character
  1821. if (ClassEscapePass0(outChar, previousSurrogatePart))
  1822. {
  1823. lastCodepoint = outChar;
  1824. }
  1825. else
  1826. {
  1827. // Last codepoint isn't a singleton, so no codepoint tracking for the sake of ranges is needed.
  1828. lastCodepoint = INVALID_CODEPOINT;
  1829. // Unless we have a possible range end, cancel our range tracking.
  1830. if (pendingRangeEnd == INVALID_CODEPOINT)
  1831. {
  1832. pendingRangeStart = INVALID_CODEPOINT;
  1833. }
  1834. }
  1835. }
  1836. else if (nextChar == '-')
  1837. {
  1838. if (pendingRangeStart != INVALID_CODEPOINT || lastCodepoint == INVALID_CODEPOINT)
  1839. {
  1840. // '-' is the upper part of the range, with pendingRangeStart codepoint is the lower. Set lastCodePoint to '-' to check at the end of the while statement
  1841. // OR
  1842. // '-' is just a char, consume it and set as last char
  1843. lastCodepoint = NextChar();
  1844. }
  1845. else
  1846. {
  1847. pendingRangeStart = this->next == this->positionAfterLastSurrogate
  1848. ? this->valueOfLastSurrogate
  1849. : lastCodepoint;
  1850. lastCodepoint = INVALID_CODEPOINT;
  1851. NextChar();
  1852. }
  1853. // If we have a pattern of the form [\ud800-\udfff], we need this to be interpreted as a range.
  1854. // In order to achieve this, the two variables that we use to track surrogate pairs, namely
  1855. // tempLocationOfSurrogatePair and previousSurrogatePart, need to be in a certain state.
  1856. //
  1857. // We need to reset tempLocationOfSurrogatePair as it points to the first Unicode escape (\ud800)
  1858. // when we're here. We need to clear it in order not to have a surrogate pair when we process the
  1859. // second escape (\udfff).
  1860. //
  1861. // previousSurrogatePart is used when we have a code point in the \u{...} extended format and the
  1862. // character is in a supplementary plane. However, there is no need to change its value here. When
  1863. // such an escape sequence is encountered, the first call to ClassEscapePass0() sets the variable
  1864. // to true, but it rewinds the input back to the beginning of the escape sequence. The next
  1865. // iteration of the loop here will again call ClassEscape0() with the same character and the
  1866. // variable will this time be set to false. Therefore, the variable will always be false here.
  1867. tempLocationOfSurrogatePair = nullptr;
  1868. Assert(!previousSurrogatePart);
  1869. }
  1870. else
  1871. {
  1872. lastCodepoint = NextChar();
  1873. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1874. {
  1875. TrackIfSurrogatePair(lastCodepoint, current, (uint32)(next - current));
  1876. }
  1877. }
  1878. if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  1879. {
  1880. // This is much easier to handle.
  1881. if (pendingRangeStart != INVALID_CODEPOINT && lastCodepoint != INVALID_CODEPOINT)
  1882. {
  1883. Assert(pendingRangeStart < 0x10000);
  1884. Assert(pendingRangeEnd == INVALID_CODEPOINT);
  1885. if (pendingRangeStart > lastCodepoint)
  1886. {
  1887. Fail(JSERR_RegExpBadRange);
  1888. }
  1889. pendingRangeStart = lastCodepoint = INVALID_CODEPOINT;
  1890. }
  1891. }
  1892. // We have a candidate for range end, and we have a range start.
  1893. else if (pendingRangeStart != INVALID_CODEPOINT && (lastCodepoint != INVALID_CODEPOINT || pendingRangeEnd != INVALID_CODEPOINT))
  1894. {
  1895. // The following will be true at the end of each surrogate pair parse.
  1896. // Note, the escape sequence \u{} is a two time parse, so this will be true on the second time around.
  1897. if (this->next == this->positionAfterLastSurrogate)
  1898. {
  1899. lastCodepoint = this->valueOfLastSurrogate;
  1900. Assert(!previousSurrogatePart);
  1901. pendingRangeEnd = lastCodepoint;
  1902. lastCodepoint = INVALID_CODEPOINT;
  1903. }
  1904. // If the next character is the end of range ']', then we can't have a surrogate pair.
  1905. // The current character is the range end, if we don't already have a candidate.
  1906. else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
  1907. {
  1908. pendingRangeEnd = lastCodepoint;
  1909. }
  1910. //If we get here, and pendingRangeEnd is set. Then one of the above has caused it to be set, or the previous iteration of the loop.
  1911. if (pendingRangeEnd != INVALID_CODEPOINT)
  1912. {
  1913. char16 leftSingleChar, rightSingleChar, ignore;
  1914. if (pendingRangeStart >= 0x10000)
  1915. {
  1916. Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeStart, &ignore, &leftSingleChar);
  1917. }
  1918. else
  1919. {
  1920. leftSingleChar = (char16)pendingRangeStart;
  1921. }
  1922. if (pendingRangeEnd >= 0x10000)
  1923. {
  1924. Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeEnd, &rightSingleChar, &ignore);
  1925. }
  1926. else
  1927. {
  1928. rightSingleChar = (char16)pendingRangeEnd;
  1929. }
  1930. // Here it is a bit tricky, we don't know if we have a unicode option specified.
  1931. // If it is, then \ud800\udc00 - \ud800\udc01 is valid, otherwise invalid.
  1932. if (pendingRangeStart < 0x10000 && pendingRangeEnd < 0x10000 && pendingRangeStart > pendingRangeEnd)
  1933. {
  1934. Fail(JSERR_RegExpBadRange);
  1935. }
  1936. else
  1937. {
  1938. if(leftSingleChar > rightSingleChar)
  1939. {
  1940. DeferredFailIfNotUnicode(JSERR_RegExpBadRange);
  1941. }
  1942. if (pendingRangeStart > pendingRangeEnd)
  1943. {
  1944. DeferredFailIfUnicode(JSERR_RegExpBadRange);
  1945. }
  1946. }
  1947. pendingRangeStart = pendingRangeEnd = INVALID_CODEPOINT;
  1948. }
  1949. // The current char < 0x10000 is a candidate for the range end, but we need to iterate one more time.
  1950. else
  1951. {
  1952. pendingRangeEnd = lastCodepoint;
  1953. }
  1954. }
  1955. nextChar = ECLookahead();
  1956. }
  1957. // We should never have a pendingRangeEnd set when we exit the loop
  1958. Assert(pendingRangeEnd == INVALID_CODEPOINT);
  1959. }
  1960. template <typename P, const bool IsLiteral>
  1961. template <bool containsSurrogates>
  1962. Node* Parser<P, IsLiteral>::CharacterClassPass1()
  1963. {
  1964. Assert(containsSurrogates ? unicodeFlagPresent : true);
  1965. CharSet<codepoint_t> codePointSet;
  1966. MatchSetNode deferredSetNode(false, false);
  1967. MatchCharNode deferredCharNode(0);
  1968. bool isNegation = false;
  1969. if (ECLookahead() == '^')
  1970. {
  1971. isNegation = true;
  1972. ECConsume();
  1973. }
  1974. // We aren't expecting any terminating null characters, only embedded ones that should treated as valid characters.
  1975. // CharacterClassPass0 should have taken care of terminating null.
  1976. codepoint_t pendingCodePoint = INVALID_CODEPOINT;
  1977. codepoint_t pendingRangeStart = INVALID_CODEPOINT;
  1978. EncodedChar nextChar = ECLookahead();
  1979. bool previousWasASurrogate = false;
  1980. bool currIsACharSet = false;
  1981. bool prevWasACharSetAndPartOfRange = false;
  1982. bool prevprevWasACharSetAndPartOfRange = false;
  1983. while(nextChar != ']')
  1984. {
  1985. codepoint_t codePointToSet = INVALID_CODEPOINT;
  1986. // Consume ahead of time if we have two backslashes, both cases below (previously Tracked surrogate pair, and ClassEscapePass1) assume it is.
  1987. if (nextChar == '\\')
  1988. {
  1989. ECConsume();
  1990. }
  1991. // These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
  1992. if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
  1993. {
  1994. codePointToSet = pendingCodePoint;
  1995. pendingCodePoint = this->currentSurrogatePairNode->value;
  1996. Assert(ECCanConsume(this->currentSurrogatePairNode->length));
  1997. ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
  1998. this->RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
  1999. this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
  2000. }
  2001. else if (nextChar == '\\')
  2002. {
  2003. Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
  2004. codePointToSet = pendingCodePoint;
  2005. if (returnedNode->tag == Node::MatchSet)
  2006. {
  2007. if (pendingRangeStart != INVALID_CODEPOINT)
  2008. {
  2009. if (unicodeFlagPresent)
  2010. {
  2011. //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
  2012. //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
  2013. Fail(JSERR_UnicodeRegExpRangeContainsCharClass); //From #sec-patterns-static-semantics-early-errors-annexb
  2014. }
  2015. codePointSet.Set(ctAllocator, '-');
  2016. }
  2017. pendingCodePoint = INVALID_CODEPOINT;
  2018. pendingRangeStart = INVALID_CODEPOINT;
  2019. codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
  2020. currIsACharSet = true;
  2021. }
  2022. else
  2023. {
  2024. // Just a character
  2025. pendingCodePoint = deferredCharNode.cs[0];
  2026. }
  2027. }
  2028. else if (nextChar == '-')
  2029. {
  2030. if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
  2031. {
  2032. // - is just a char, or end of a range.
  2033. codePointToSet = pendingCodePoint;
  2034. pendingCodePoint = '-';
  2035. ECConsume();
  2036. }
  2037. else
  2038. {
  2039. pendingRangeStart = pendingCodePoint;
  2040. ECConsume();
  2041. }
  2042. }
  2043. else
  2044. {
  2045. // Just a character, consume it
  2046. codePointToSet = pendingCodePoint;
  2047. pendingCodePoint = NextChar();
  2048. }
  2049. if (codePointToSet != INVALID_CODEPOINT || prevprevWasACharSetAndPartOfRange)
  2050. {
  2051. if (prevprevWasACharSetAndPartOfRange)
  2052. {
  2053. //We a range containing a character class and the unicode flag is present, thus we end up having to throw a "Syntax" error here
  2054. //This breaks the notion of Pass0 check for valid syntax, because during that time, the unicode flag is unknown.
  2055. if (unicodeFlagPresent)
  2056. {
  2057. Fail(JSERR_UnicodeRegExpRangeContainsCharClass);
  2058. }
  2059. if (pendingCodePoint != INVALID_CODEPOINT)
  2060. {
  2061. codePointSet.Set(ctAllocator, pendingCodePoint);
  2062. }
  2063. codePointSet.Set(ctAllocator, '-'); //Add '-' to set because a range was detected but turned out to be a union of character set with '-' and another atom.
  2064. pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
  2065. }
  2066. else if (pendingRangeStart != INVALID_CODEPOINT)
  2067. {
  2068. if (pendingRangeStart > pendingCodePoint)
  2069. {
  2070. //We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
  2071. //This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
  2072. Assert(!unicodeFlagPresent);
  2073. Fail(JSERR_RegExpBadRange);
  2074. }
  2075. codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
  2076. pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
  2077. }
  2078. else
  2079. {
  2080. codePointSet.Set(ctAllocator, codePointToSet);
  2081. }
  2082. }
  2083. nextChar = ECLookahead();
  2084. prevprevWasACharSetAndPartOfRange = prevWasACharSetAndPartOfRange;
  2085. prevWasACharSetAndPartOfRange = currIsACharSet && nextChar == '-';
  2086. currIsACharSet = false;
  2087. }
  2088. if (pendingCodePoint != INVALID_CODEPOINT)
  2089. {
  2090. codePointSet.Set(ctAllocator, pendingCodePoint);
  2091. }
  2092. // At this point, we have a complete set of codepoints representing the range.
  2093. // Before performing translation of any kind, we need to do some case filling.
  2094. // At the point of this comment, there are no case mappings going cross-plane between simple
  2095. // characters (< 0x10000) and supplementary characters (>= 0x10000)
  2096. // However, it might still be the case, and this has to be handled.
  2097. // On the other hand, we don't want to prevent optimizations that expect non-casefolded sets from happening.
  2098. // At least for simple characters.
  2099. // The simple case, is when the unicode flag isn't specified, we can go ahead and return the simple set.
  2100. // Negations and case mappings will be handled later.
  2101. if (!unicodeFlagPresent)
  2102. {
  2103. Assert(codePointSet.SimpleCharCount() == codePointSet.Count());
  2104. MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
  2105. codePointSet.CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
  2106. return simpleToReturn;
  2107. }
  2108. // Everything past here must be under the flag
  2109. Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2110. if (codePointSet.IsEmpty())
  2111. {
  2112. return Anew(ctAllocator, MatchSetNode, false, false);
  2113. }
  2114. Node* prefixNode = nullptr;
  2115. Node* suffixNode = nullptr;
  2116. CharSet<codepoint_t> *toUseForTranslation = &codePointSet;
  2117. // If a singleton, return a simple character
  2118. bool isSingleton = !this->caseInsensitiveFlagPresent && !isNegation && codePointSet.IsSingleton();
  2119. if (isSingleton)
  2120. {
  2121. codepoint_t singleton = codePointSet.Singleton();
  2122. Node* toReturn = nullptr;
  2123. if (singleton < 0x10000)
  2124. {
  2125. toReturn = Anew(ctAllocator, MatchCharNode, (char16)singleton);
  2126. }
  2127. else
  2128. {
  2129. Assert(unicodeFlagPresent);
  2130. char16 lowerSurrogate, upperSurrogate;
  2131. Js::NumberUtilities::CodePointAsSurrogatePair(singleton, &lowerSurrogate, &upperSurrogate);
  2132. toReturn = CreateSurrogatePairAtom(lowerSurrogate, upperSurrogate);
  2133. }
  2134. codePointSet.Clear(ctAllocator);
  2135. return toReturn;
  2136. }
  2137. // If negation, we want to complement the simple chars.
  2138. // When a set is negated, optimizations skip checking if applicable, so we can go ahead and negate it here.
  2139. CharSet<codepoint_t> negatedSet;
  2140. if (!this->caseInsensitiveFlagPresent)
  2141. {
  2142. if (isNegation)
  2143. {
  2144. // Complement all characters, and use it as the set toTranslate
  2145. codePointSet.ToComplement(ctAllocator, negatedSet);
  2146. }
  2147. toUseForTranslation = isNegation ? &negatedSet : &codePointSet;
  2148. if (isNegation)
  2149. {
  2150. // Clear this, as we will no longer need this.
  2151. codePointSet.FreeBody(ctAllocator);
  2152. }
  2153. }
  2154. else
  2155. {
  2156. CharSet<codepoint_t> caseEquivalent;
  2157. codePointSet.ToEquivClass(ctAllocator, caseEquivalent);
  2158. // Equiv set can't have a reduced count of chars
  2159. Assert(caseEquivalent.Count() >= codePointSet.Count());
  2160. // Here we have a regex that has both case insensitive and unicode options.
  2161. // The range might also be negated. If it is negated, we can go ahead and negate
  2162. // the entire set as well as fill in cases, as optimizations wouldn't kick in anyways.
  2163. if (isNegation)
  2164. {
  2165. codePointSet.Clear(ctAllocator);
  2166. caseEquivalent.ToComplement(ctAllocator, codePointSet);
  2167. caseEquivalent.FreeBody(ctAllocator);
  2168. }
  2169. else
  2170. {
  2171. codePointSet.CloneFrom(ctAllocator, caseEquivalent);
  2172. }
  2173. Assert(toUseForTranslation == &codePointSet);
  2174. }
  2175. uint totalCodePointsCount = toUseForTranslation->Count();
  2176. uint simpleCharsCount = toUseForTranslation->SimpleCharCount();
  2177. if (totalCodePointsCount == simpleCharsCount)
  2178. {
  2179. MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
  2180. toUseForTranslation->CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
  2181. return simpleToReturn;
  2182. }
  2183. if (simpleCharsCount > 0)
  2184. {
  2185. if (!toUseForTranslation->ContainSurrogateCodeUnits())
  2186. {
  2187. MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
  2188. toUseForTranslation->CloneSimpleCharsTo(ctAllocator, node->set);
  2189. prefixNode = node;
  2190. }
  2191. else
  2192. {
  2193. MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
  2194. toUseForTranslation->CloneNonSurrogateCodeUnitsTo(ctAllocator, node->set);
  2195. prefixNode = node;
  2196. node = Anew(ctAllocator, MatchSetNode, false, false);
  2197. toUseForTranslation->CloneSurrogateCodeUnitsTo(ctAllocator, node->set);
  2198. suffixNode = node;
  2199. }
  2200. }
  2201. Assert(unicodeFlagPresent);
  2202. AltNode *headToReturn = prefixNode == nullptr ? nullptr : Anew(ctAllocator, AltNode, prefixNode, nullptr);
  2203. AltNode *currentTail = headToReturn;
  2204. codepoint_t charRangeSearchIndex = 0x10000, lowerCharOfRange = 0, upperCharOfRange = 0;
  2205. while (toUseForTranslation->GetNextRange(charRangeSearchIndex, &lowerCharOfRange, &upperCharOfRange))
  2206. {
  2207. if (lowerCharOfRange == upperCharOfRange)
  2208. {
  2209. currentTail = this->AppendSurrogatePairToDisjunction(lowerCharOfRange, currentTail);
  2210. }
  2211. else
  2212. {
  2213. currentTail = this->AppendSurrogateRangeToDisjunction(lowerCharOfRange, upperCharOfRange, currentTail);
  2214. }
  2215. if (headToReturn == nullptr)
  2216. {
  2217. headToReturn = currentTail;
  2218. }
  2219. AnalysisAssert(currentTail != nullptr);
  2220. while (currentTail->tail != nullptr)
  2221. {
  2222. currentTail = currentTail->tail;
  2223. }
  2224. charRangeSearchIndex = upperCharOfRange + 1;
  2225. }
  2226. if (suffixNode != nullptr)
  2227. {
  2228. currentTail->tail = Anew(ctAllocator, AltNode, suffixNode, nullptr);
  2229. }
  2230. toUseForTranslation->Clear(ctAllocator);
  2231. if (headToReturn != nullptr && headToReturn->tail == nullptr)
  2232. {
  2233. return headToReturn->head;
  2234. }
  2235. return headToReturn;
  2236. }
  2237. #pragma warning(push)
  2238. #pragma warning(disable:4702) // unreachable code
  2239. template <typename P, const bool IsLiteral>
  2240. bool Parser<P, IsLiteral>::ClassEscapePass0(Char& singleton, bool& previousSurrogatePart)
  2241. {
  2242. // Could be terminating 0
  2243. EncodedChar ec = ECLookahead();
  2244. if (ec == 0 && IsEOF())
  2245. {
  2246. Fail(JSERR_RegExpSyntax);
  2247. return false;
  2248. }
  2249. else if (standardEncodedChars->IsOctal(ec))
  2250. {
  2251. uint n = 0;
  2252. CharCount digits = 0;
  2253. do
  2254. {
  2255. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  2256. if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
  2257. break;
  2258. n = m;
  2259. ECConsume();
  2260. digits++;
  2261. }
  2262. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
  2263. singleton = UTC((UChar)n);
  2264. // Clear possible pair
  2265. this->tempLocationOfSurrogatePair = nullptr;
  2266. return true;
  2267. }
  2268. else
  2269. {
  2270. const EncodedChar* location = this->tempLocationOfSurrogatePair;
  2271. // Clear it for now, otherwise to many branches to clear it on.
  2272. this->tempLocationOfSurrogatePair = nullptr;
  2273. // An escaped '/' is ok
  2274. Char c = NextChar();
  2275. switch (c)
  2276. {
  2277. case 'b':
  2278. singleton = '\b';
  2279. return true;
  2280. case 'f':
  2281. singleton = '\f';
  2282. return true;
  2283. case 'n':
  2284. singleton = '\n';
  2285. return true;
  2286. case 'r':
  2287. singleton = '\r';
  2288. return true;
  2289. case 't':
  2290. singleton = '\t';
  2291. return true;
  2292. case 'v':
  2293. singleton = '\v';
  2294. return true;
  2295. case 'd':
  2296. case 'D':
  2297. case 's':
  2298. case 'S':
  2299. case 'w':
  2300. case 'W':
  2301. return false;
  2302. case 'c':
  2303. if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
  2304. {
  2305. singleton = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
  2306. ECConsume();
  2307. }
  2308. else
  2309. {
  2310. if (!IsEOF())
  2311. {
  2312. EncodedChar ecLookahead = ECLookahead();
  2313. switch (ecLookahead)
  2314. {
  2315. case '-':
  2316. case ']':
  2317. singleton = c;
  2318. break;
  2319. default:
  2320. singleton = UTC(Chars<EncodedChar>::CTU(ecLookahead) % 32);
  2321. ECConsume();
  2322. break;
  2323. }
  2324. }
  2325. else
  2326. singleton = c;
  2327. }
  2328. return true;
  2329. case 'x':
  2330. if (ECCanConsume(2) &&
  2331. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2332. standardEncodedChars->IsHex(ECLookahead(1)))
  2333. {
  2334. singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  2335. (standardEncodedChars->DigitValue(ECLookahead(1))));
  2336. ECConsume(2);
  2337. }
  2338. else
  2339. singleton = c;
  2340. return true;
  2341. case 'u':
  2342. this->tempLocationOfSurrogatePair = location;
  2343. if (this->TryParseExtendedUnicodeEscape(singleton, previousSurrogatePart, true) > 0)
  2344. return true;
  2345. else if (ECCanConsume(4) &&
  2346. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2347. standardEncodedChars->IsHex(ECLookahead(1)) &&
  2348. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2349. standardEncodedChars->IsHex(ECLookahead(3)))
  2350. {
  2351. singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  2352. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  2353. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  2354. (standardEncodedChars->DigitValue(ECLookahead(3))));
  2355. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2356. {
  2357. // Current location
  2358. TrackIfSurrogatePair(singleton, (next - 1), 5);
  2359. }
  2360. // The above if statement, if true, will clear tempLocationOfSurrogatePair if needs to.
  2361. ECConsume(4);
  2362. }
  2363. else
  2364. singleton = c;
  2365. return true;
  2366. default:
  2367. // embedded 0 is ok
  2368. singleton = c;
  2369. return true;
  2370. }
  2371. }
  2372. }
  2373. #pragma warning(pop)
  2374. template <typename P, const bool IsLiteral>
  2375. Node* Parser<P, IsLiteral>::ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart)
  2376. {
  2377. // Checked for terminating 0 is pass 0
  2378. Assert(!IsEOF());
  2379. if (standardEncodedChars->IsOctal(ECLookahead()))
  2380. {
  2381. // As per Annex B, allow octal escapes instead of just \0 (and \8 and \9 are identity escapes).
  2382. // Must be between 1 and 3 octal digits.
  2383. uint n = 0;
  2384. CharCount digits = 0;
  2385. do
  2386. {
  2387. uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
  2388. if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
  2389. break;
  2390. n = m;
  2391. ECConsume();
  2392. digits++;
  2393. }
  2394. while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
  2395. deferredCharNode->cs[0] = UTC((UChar)n);
  2396. return deferredCharNode;
  2397. }
  2398. else
  2399. {
  2400. Char c = NextChar();
  2401. switch (c)
  2402. {
  2403. case 'b':
  2404. c = '\b';
  2405. break; // fall-through for identity escape
  2406. case 'f':
  2407. c = '\f';
  2408. break; // fall-through for identity escape
  2409. case 'n':
  2410. c = '\n';
  2411. break; // fall-through for identity escape
  2412. case 'r':
  2413. c = '\r';
  2414. break; // fall-through for identity escape
  2415. case 't':
  2416. c = '\t';
  2417. break; // fall-through for identity escape
  2418. case 'v':
  2419. c = '\v';
  2420. break; // fall-through for identity escape
  2421. case 'd':
  2422. standardChars->SetDigits(ctAllocator, deferredSetNode->set);
  2423. return deferredSetNode;
  2424. case 'D':
  2425. standardChars->SetNonDigits(ctAllocator, deferredSetNode->set);
  2426. return deferredSetNode;
  2427. case 's':
  2428. standardChars->SetWhitespace(ctAllocator, deferredSetNode->set);
  2429. return deferredSetNode;
  2430. case 'S':
  2431. standardChars->SetNonWhitespace(ctAllocator, deferredSetNode->set);
  2432. return deferredSetNode;
  2433. case 'w':
  2434. standardChars->SetWordChars(ctAllocator, deferredSetNode->set);
  2435. return deferredSetNode;
  2436. case 'W':
  2437. standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
  2438. return deferredSetNode;
  2439. case 'c':
  2440. if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
  2441. {
  2442. c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
  2443. ECConsume();
  2444. // fall-through for identity escape
  2445. }
  2446. else
  2447. {
  2448. // SPEC DEVIATION: For non-letters, still take lower 5 bits, e.g. [\c1] == [\x11].
  2449. // However, '-', ']', and EOF make the \c just a 'c'.
  2450. if (!IsEOF())
  2451. {
  2452. EncodedChar ec = ECLookahead();
  2453. switch (ec)
  2454. {
  2455. case '-':
  2456. case ']':
  2457. // fall-through for identity escape with 'c'
  2458. break;
  2459. default:
  2460. c = UTC(Chars<EncodedChar>::CTU(ec) % 32);
  2461. ECConsume();
  2462. // fall-through for identity escape
  2463. break;
  2464. }
  2465. }
  2466. // else: fall-through for identity escape with 'c'
  2467. }
  2468. break;
  2469. case 'x':
  2470. if (ECCanConsume(2) &&
  2471. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2472. standardEncodedChars->IsHex(ECLookahead(1)))
  2473. {
  2474. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
  2475. (standardEncodedChars->DigitValue(ECLookahead(1))));
  2476. ECConsume(2);
  2477. // fall-through for identity escape
  2478. }
  2479. // Take to be identity escape if ill-formed as per Annex B
  2480. break;
  2481. case 'u':
  2482. if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
  2483. break;
  2484. else if (ECCanConsume(4) &&
  2485. standardEncodedChars->IsHex(ECLookahead(0)) &&
  2486. standardEncodedChars->IsHex(ECLookahead(1)) &&
  2487. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2488. standardEncodedChars->IsHex(ECLookahead(3)))
  2489. {
  2490. c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
  2491. (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
  2492. (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
  2493. (standardEncodedChars->DigitValue(ECLookahead(3))));
  2494. ECConsume(4);
  2495. // fall-through for identity escape
  2496. }
  2497. // Take to be identity escape if ill-formed as per Annex B.
  2498. break;
  2499. default:
  2500. // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok.
  2501. break;
  2502. }
  2503. // Must be an identity escape
  2504. deferredCharNode->cs[0] = c;
  2505. return deferredCharNode;
  2506. }
  2507. }
  2508. //
  2509. // Options
  2510. //
  2511. template <typename P, const bool IsLiteral>
  2512. void Parser<P, IsLiteral>::Options(RegexFlags& flags)
  2513. {
  2514. while (true)
  2515. {
  2516. // Could be terminating 0
  2517. EncodedChar ec = ECLookahead();
  2518. CharCount consume;
  2519. Char c;
  2520. if (ec == 0)
  2521. // Embedded 0 not valid
  2522. return;
  2523. else if (IsLiteral &&
  2524. ec == '\\' &&
  2525. ECCanConsume(6) &&
  2526. ECLookahead(1) == 'u' &&
  2527. standardEncodedChars->IsHex(ECLookahead(2)) &&
  2528. standardEncodedChars->IsHex(ECLookahead(3)) &&
  2529. standardEncodedChars->IsHex(ECLookahead(4)) &&
  2530. standardEncodedChars->IsHex(ECLookahead(5)))
  2531. {
  2532. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2533. {
  2534. Fail(JSERR_RegExpSyntax);
  2535. return;
  2536. }
  2537. else
  2538. {
  2539. uint32 n = (standardEncodedChars->DigitValue(ECLookahead(2)) << 12) |
  2540. (standardEncodedChars->DigitValue(ECLookahead(3)) << 8) |
  2541. (standardEncodedChars->DigitValue(ECLookahead(4)) << 4) |
  2542. (standardEncodedChars->DigitValue(ECLookahead(5)));
  2543. c = UTC(n);
  2544. consume = 6;
  2545. }
  2546. }
  2547. else
  2548. {
  2549. c = Chars<EncodedChar>::CTW(ec);
  2550. consume = 1;
  2551. }
  2552. switch (c) {
  2553. case 'i':
  2554. if ((flags & IgnoreCaseRegexFlag) != 0)
  2555. {
  2556. Fail(JSERR_RegExpSyntax);
  2557. }
  2558. flags = (RegexFlags)(flags | IgnoreCaseRegexFlag);
  2559. break;
  2560. case 'g':
  2561. if ((flags & GlobalRegexFlag) != 0)
  2562. {
  2563. Fail(JSERR_RegExpSyntax);
  2564. }
  2565. flags = (RegexFlags)(flags | GlobalRegexFlag);
  2566. break;
  2567. case 'm':
  2568. if ((flags & MultilineRegexFlag) != 0)
  2569. {
  2570. Fail(JSERR_RegExpSyntax);
  2571. }
  2572. flags = (RegexFlags)(flags | MultilineRegexFlag);
  2573. break;
  2574. case 'u':
  2575. // If we don't have unicode enabled, fall through to default
  2576. if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
  2577. {
  2578. if ((flags & UnicodeRegexFlag) != 0)
  2579. {
  2580. Fail(JSERR_RegExpSyntax);
  2581. }
  2582. flags = (RegexFlags)(flags | UnicodeRegexFlag);
  2583. // For telemetry
  2584. CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(ES6, UnicodeRegexFlag, scriptContext);
  2585. break;
  2586. }
  2587. case 'y':
  2588. if (scriptContext->GetConfig()->IsES6RegExStickyEnabled())
  2589. {
  2590. if ((flags & StickyRegexFlag) != 0)
  2591. {
  2592. Fail(JSERR_RegExpSyntax);
  2593. }
  2594. flags = (RegexFlags)(flags | StickyRegexFlag);
  2595. // For telemetry
  2596. CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(ES6, StickyRegexFlag, scriptContext);
  2597. break;
  2598. }
  2599. default:
  2600. if (standardChars->IsWord(c))
  2601. {
  2602. // Outer context could never parse this character. Signal the syntax error as
  2603. // being part of the regex.
  2604. Fail(JSERR_RegExpSyntax);
  2605. }
  2606. return;
  2607. }
  2608. ECConsume(consume);
  2609. }
  2610. }
  2611. //
  2612. // Entry points
  2613. //
  2614. template <typename P, const bool IsLiteral>
  2615. Node* Parser<P, IsLiteral>::ParseDynamic
  2616. ( const EncodedChar* body
  2617. , const EncodedChar* bodyLim
  2618. , const EncodedChar* opts
  2619. , const EncodedChar* optsLim
  2620. , RegexFlags& flags )
  2621. {
  2622. Assert(!IsLiteral);
  2623. Assert(body != 0);
  2624. Assert(bodyLim >= body && *bodyLim == 0);
  2625. Assert(opts == 0 || (optsLim >= opts && *optsLim == 0));
  2626. // Body, pass 0
  2627. SetPosition(body, bodyLim, true);
  2628. PatternPass0();
  2629. if (!IsEOF())
  2630. Fail(JSERR_RegExpSyntax);
  2631. // Options
  2632. if (opts != 0)
  2633. {
  2634. SetPosition(opts, optsLim, false);
  2635. Options(flags);
  2636. if (!IsEOF())
  2637. Fail(JSERR_RegExpSyntax);
  2638. this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2639. this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2640. Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2641. }
  2642. else
  2643. {
  2644. this->unicodeFlagPresent = false;
  2645. this->caseInsensitiveFlagPresent = false;
  2646. }
  2647. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2648. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2649. {
  2650. throw ParseError(*deferredIfNotUnicodeError);
  2651. }
  2652. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2653. {
  2654. throw ParseError(*deferredIfUnicodeError);
  2655. }
  2656. this->currentSurrogatePairNode = this->surrogatePairList;
  2657. // Body, pass 1
  2658. SetPosition(body, bodyLim, true);
  2659. Node* root = PatternPass1();
  2660. Assert(IsEOF());
  2661. return root;
  2662. }
  2663. template <typename P, const bool IsLiteral>
  2664. Node* Parser<P, IsLiteral>::ParseLiteral
  2665. ( const EncodedChar* input
  2666. , const EncodedChar* inputLim
  2667. , CharCount& outBodyEncodedChars
  2668. , CharCount& outTotalEncodedChars
  2669. , CharCount& outBodyChars
  2670. , CharCount& outTotalChars
  2671. , RegexFlags& flags )
  2672. {
  2673. Assert(IsLiteral);
  2674. Assert(input != 0);
  2675. Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
  2676. // To handle surrogate pairs properly under unicode option, we will collect information on location of the pairs
  2677. // during pass 0, regardless if the option is present. (We aren't able to get it at that time)
  2678. // During pass 1, we will use that information to correctly create appropriate nodes.
  2679. // Body, pass 0
  2680. SetPosition(input, inputLim, true);
  2681. PatternPass0();
  2682. outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2683. outBodyChars = Pos();
  2684. // Options are needed for the next pass
  2685. ECMust('/', ERRnoSlash);
  2686. Options(flags);
  2687. this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2688. this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2689. Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
  2690. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2691. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2692. {
  2693. throw ParseError(*deferredIfNotUnicodeError);
  2694. }
  2695. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2696. {
  2697. throw ParseError(*deferredIfUnicodeError);
  2698. }
  2699. // Used below to proceed to the end of the regex
  2700. const EncodedChar *pastOptions = next;
  2701. this->currentSurrogatePairNode = this->surrogatePairList;
  2702. // Body, pass 1
  2703. SetPosition(input, inputLim, true);
  2704. Node* root = PatternPass1();
  2705. Assert(outBodyEncodedChars == Chars<EncodedChar>::OSB(next, input));
  2706. Assert(outBodyChars == Pos());
  2707. next = pastOptions;
  2708. outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2709. outTotalChars = Pos();
  2710. return root;
  2711. }
  2712. template <typename P, const bool IsLiteral>
  2713. void Parser<P, IsLiteral>::ParseLiteralNoAST
  2714. ( const EncodedChar* input
  2715. , const EncodedChar* inputLim
  2716. , CharCount& outBodyEncodedChars
  2717. , CharCount& outTotalEncodedChars
  2718. , CharCount& outBodyChars
  2719. , CharCount& outTotalChars )
  2720. {
  2721. Assert(IsLiteral);
  2722. Assert(input != 0);
  2723. Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
  2724. // Body, pass 0
  2725. SetPosition(input, inputLim, true);
  2726. PatternPass0();
  2727. outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2728. outBodyChars = Pos();
  2729. // Options
  2730. ECMust('/', ERRnoSlash);
  2731. RegexFlags dummyFlags = NoRegexFlags;
  2732. Options(dummyFlags);
  2733. this->unicodeFlagPresent = (dummyFlags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
  2734. this->caseInsensitiveFlagPresent = (dummyFlags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
  2735. outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
  2736. outTotalChars = Pos();
  2737. // If this HR has been set, that means we have an earlier failure than the one caught above.
  2738. if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
  2739. {
  2740. throw ParseError(*deferredIfNotUnicodeError);
  2741. }
  2742. else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
  2743. {
  2744. throw ParseError(*deferredIfUnicodeError);
  2745. }
  2746. }
  2747. template <typename P, const bool IsLiteral>
  2748. template <const bool buildAST>
  2749. RegexPattern * Parser<P, IsLiteral>::CompileProgram
  2750. ( Node* root,
  2751. const EncodedChar*& currentCharacter,
  2752. const CharCount totalLen,
  2753. const CharCount bodyChars,
  2754. const CharCount bodyEncodedChars,
  2755. const CharCount totalChars,
  2756. const RegexFlags flags )
  2757. {
  2758. Assert(IsLiteral);
  2759. Program* program = nullptr;
  2760. if (buildAST)
  2761. {
  2762. const auto recycler = this->scriptContext->GetRecycler();
  2763. program = Program::New(recycler, flags);
  2764. this->CaptureSourceAndGroups(recycler, program, currentCharacter, bodyChars, bodyEncodedChars);
  2765. }
  2766. currentCharacter += totalLen;
  2767. Assert(GetMultiUnits() == totalLen - totalChars);
  2768. if (!buildAST)
  2769. {
  2770. return nullptr;
  2771. }
  2772. RegexPattern* pattern = RegexPattern::New(this->scriptContext, program, true);
  2773. #if ENABLE_REGEX_CONFIG_OPTIONS
  2774. RegexStats* stats = 0;
  2775. if (REGEX_CONFIG_FLAG(RegexProfile))
  2776. {
  2777. stats = this->scriptContext->GetRegexStatsDatabase()->GetRegexStats(pattern);
  2778. this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Parse);
  2779. }
  2780. if (REGEX_CONFIG_FLAG(RegexTracing))
  2781. {
  2782. DebugWriter* tw = this->scriptContext->GetRegexDebugWriter();
  2783. tw->Print(_u("// REGEX COMPILE "));
  2784. pattern->Print(tw);
  2785. tw->EOL();
  2786. }
  2787. if (REGEX_CONFIG_FLAG(RegexProfile))
  2788. this->scriptContext->GetRegexStatsDatabase()->BeginProfile();
  2789. #endif
  2790. ArenaAllocator* rtAllocator = this->scriptContext->RegexAllocator();
  2791. Compiler::Compile
  2792. ( this->scriptContext
  2793. , ctAllocator
  2794. , rtAllocator
  2795. , standardChars
  2796. , program
  2797. , root
  2798. , this->GetLitbuf()
  2799. , pattern
  2800. #if ENABLE_REGEX_CONFIG_OPTIONS
  2801. , w
  2802. , stats
  2803. #endif
  2804. );
  2805. #if ENABLE_REGEX_CONFIG_OPTIONS
  2806. if (REGEX_CONFIG_FLAG(RegexProfile))
  2807. this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Compile);
  2808. #endif
  2809. #ifdef PROFILE_EXEC
  2810. this->scriptContext->ProfileEnd(Js::RegexCompilePhase);
  2811. #endif
  2812. // CaptureSourceAndGroups throws if this condition doesn't hold.
  2813. Assert(0 < pattern->NumGroups() && pattern->NumGroups() <= MAX_NUM_GROUPS);
  2814. return pattern;
  2815. }
  2816. template <typename P, const bool IsLiteral>
  2817. void Parser<P, IsLiteral>::CaptureEmptySourceAndNoGroups(Program* program)
  2818. {
  2819. Assert(program->source == 0);
  2820. program->source = const_cast<Char*>(_u(""));
  2821. program->sourceLen = 0;
  2822. program->numGroups = 1;
  2823. // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
  2824. }
  2825. template <typename P, const bool IsLiteral>
  2826. void Parser<P, IsLiteral>::CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars, CharCount bodyEncodedChars)
  2827. {
  2828. Assert(program->source == 0);
  2829. Assert(body != 0);
  2830. // Program will own source string
  2831. program->source = RecyclerNewArrayLeaf(recycler, Char, bodyChars + 1);
  2832. // Don't need to zero out since we're writing to the buffer right here
  2833. this->ConvertToUnicode(program->source, bodyChars, body, body + bodyEncodedChars);
  2834. program->source[bodyChars] = 0;
  2835. program->sourceLen = bodyChars;
  2836. // We expect nextGroupId to be positive, because the full regexp itself always
  2837. // counts as a capturing group.
  2838. Assert(nextGroupId > 0);
  2839. if (nextGroupId > MAX_NUM_GROUPS)
  2840. {
  2841. Js::JavascriptError::ThrowRangeError(this->scriptContext, JSERR_RegExpTooManyCapturingGroups);
  2842. }
  2843. else
  2844. {
  2845. program->numGroups = static_cast<uint16>(nextGroupId);
  2846. }
  2847. // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
  2848. }
  2849. template <typename P, const bool IsLiteral>
  2850. Node* Parser<P, IsLiteral>::GetNodeWithValidCharacterSet(EncodedChar cc)
  2851. {
  2852. Node* nodeToReturn = nullptr;
  2853. if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && this->unicodeFlagPresent)
  2854. {
  2855. MatchSetNode *lowerRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
  2856. lowerRangeNode->set.SetRange(ctAllocator, (Char)0xD800, (Char)0xDBFF);
  2857. MatchSetNode *upperRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
  2858. upperRangeNode->set.SetRange(ctAllocator, (Char)0xDC00, (Char)0xDFFF);
  2859. ConcatNode* surrogateRangePairNode = Anew(ctAllocator, ConcatNode, lowerRangeNode, Anew(ctAllocator, ConcatNode, upperRangeNode, nullptr));
  2860. // The MatchSet node will be split into [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) as a prefix, and a suffix of [D800-DBFF]
  2861. // i.e. The MatchSet node can be with [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) OR [D800-DBFF]
  2862. MatchSetNode* partialPrefixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  2863. switch (cc)
  2864. {
  2865. case '.':
  2866. standardChars->SetNonNewline(ctAllocator, partialPrefixSetNode->set);
  2867. break;
  2868. case 'S':
  2869. standardChars->SetNonWhitespace(ctAllocator, partialPrefixSetNode->set);
  2870. break;
  2871. case 'D':
  2872. standardChars->SetNonDigits(ctAllocator, partialPrefixSetNode->set);
  2873. break;
  2874. case 'W':
  2875. if (this->caseInsensitiveFlagPresent)
  2876. {
  2877. standardChars->SetNonWordIUChars(ctAllocator, partialPrefixSetNode->set);
  2878. }
  2879. else
  2880. {
  2881. standardChars->SetNonWordChars(ctAllocator, partialPrefixSetNode->set);
  2882. }
  2883. break;
  2884. default:
  2885. AssertMsg(false, "");
  2886. }
  2887. partialPrefixSetNode->set.SubtractRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
  2888. MatchSetNode* partialSuffixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
  2889. partialSuffixSetNode->set.SetRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
  2890. AltNode* altNode = Anew(ctAllocator, AltNode, partialPrefixSetNode, Anew(ctAllocator, AltNode, surrogateRangePairNode, Anew(ctAllocator, AltNode, partialSuffixSetNode, nullptr)));
  2891. nodeToReturn = altNode;
  2892. }
  2893. else
  2894. {
  2895. MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
  2896. switch (cc)
  2897. {
  2898. case '.':
  2899. standardChars->SetNonNewline(ctAllocator, setNode->set);
  2900. break;
  2901. case 'S':
  2902. standardChars->SetNonWhitespace(ctAllocator, setNode->set);
  2903. break;
  2904. case 'D':
  2905. standardChars->SetNonDigits(ctAllocator, setNode->set);
  2906. break;
  2907. case 'W':
  2908. standardChars->SetNonWordChars(ctAllocator, setNode->set);
  2909. break;
  2910. default:
  2911. AssertMsg(false, "");
  2912. }
  2913. nodeToReturn = setNode;
  2914. }
  2915. return nodeToReturn;
  2916. }
  2917. template <typename P, const bool IsLiteral>
  2918. void Parser<P, IsLiteral>::FreeBody()
  2919. {
  2920. if (litbuf != 0)
  2921. {
  2922. ctAllocator->Free(litbuf, litbufLen);
  2923. litbuf = nullptr;
  2924. litbufLen = 0;
  2925. litbufNext = 0;
  2926. }
  2927. }
  2928. // Instantiate all templates
  2929. #define INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, IsLiteral, BuildAST) \
  2930. template RegexPattern* Parser<EncodingPolicy, IsLiteral>::CompileProgram<BuildAST>(Node* root, const EncodedChar*& currentCharacter, const CharCount totalLen, const CharCount bodyChars, const CharCount bodyEncodedChars, const CharCount totalChars, const RegexFlags flags );
  2931. #define INSTANTIATE_REGEX_PARSER(EncodingPolicy) \
  2932. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, false, false) \
  2933. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, false, true) \
  2934. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, true, false) \
  2935. INSTANTIATE_REGEX_PARSER_COMPILE(EncodingPolicy, true, true) \
  2936. template class Parser<EncodingPolicy, false>; \
  2937. template class Parser<EncodingPolicy, true>;
  2938. // Instantiate the Parser
  2939. INSTANTIATE_REGEX_PARSER(NullTerminatedUnicodeEncodingPolicy);
  2940. INSTANTIATE_REGEX_PARSER(NotNullTerminatedUTF8EncodingPolicy);
  2941. }