| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098 |
- //-------------------------------------------------------------------------------------------------------
- // Copyright (C) Microsoft. All rights reserved.
- // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
- //-------------------------------------------------------------------------------------------------------
- /*
- --------------------------------------------------------------------------------------------------------------------------------
- Original
- --------------------------------------------------------------------------------------------------------------------------------
- Pattern ::= Disjunction
- Disjunction ::= Alternative | Alternative '|' Disjunction
- Alternative ::= [empty] | Alternative Term
- Term ::= Assertion | Atom | Atom Quantifier
- Assertion ::= '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
- Quantifier ::= QuantifierPrefix | QuantifierPrefix '?'
- QuantifierPrefix ::= '*' | '+' | '?' | '{' DecimalDigits '}' | '{' DecimalDigits ',' '}' | '{' DecimalDigits ',' DecimalDigits '}'
- Atom ::= PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')'
- PatternCharacter ::= SourceCharacter but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
- AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
- CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
- ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
- ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
- IdentityEscape ::= (SourceCharacter but not IdentifierPart) | <ZWJ> | <ZWNJ>
- DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
- CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
- CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
- ClassRanges ::= [empty] | NonemptyClassRanges
- NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
- NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
- ClassAtom ::= '-' | ClassAtomNoDash
- ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
- ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
- SourceCharacter ::= <unicode character>
- HexEscapeSequence ::= 'x' HexDigit HexDigit
- UnicodeEscapeSequence ::= 'u' HexDigit HexDigit HexDigit HexDigit | 'u' '{' HexDigits '}'
- HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
- IdentifierStart ::= UnicodeLetter | '$' | '_' | '\' UnicodeEscapeSequence
- IdentifierPart ::= IdentifierStart | UnicodeCombiningMark | UnicodeDigit | UnicodeConnectorPunctuation | <ZWNJ> | <ZWJ>
- UnicodeLetter ::= <unicode Uppercase letter> | <unicode Lowercase letter> | <unicode Titlecase letter> | <unicode Modifier letter> | <unicode Other letter> | <unicode Letter number>
- UnicodeCombiningMark = <unicode Non-spacing mark> | <unicode combining spacing mark>
- UnicodeDigit ::= <unicode Decimal number>
- UnicodeConnectorPunctuation ::= <unicode Connector punctuation>
- DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
- DecimalDigits ::= DecimalDigit | DecimalDigits DecimalDigit
- DecimalDigit ::= one of { '0'..'9' }
- NonZeroDigit ::= one of { '1'..'9' }
- ------------------
- Annex B Deviations
- ------------------
- 1. The assertions (?= ) and (?! ) are treated as though they have a surrounding non-capture group, and hence can be quantified.
- Other assertions are not quantifiable.
- QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
- Assertion [replaced] ::= '^' | '$' | '\' 'b' | '\' 'B' | QuantifiableAssertion
- Term ::= ... | QuantifiableAssertion Quantifier [added]
- --------------------------------------------------------------------------------------------------------------------------------
- Left factored
- --------------------------------------------------------------------------------------------------------------------------------
- Pattern ::= Disjunction
- Disjunction ::= Alternative ('|' Alternative)*
- FOLLOW(Disjunction) = { <eof>, ')' }
- Alternative ::= Term*
- FOLLOW(Alternative) = { <eof>, ')', '|' }
- Term ::= ( '^' | '$' | '\' 'b' | '\' 'B' | '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')' | Atom Quantifier? )
- | ( PatternCharacter | '.' | '\' AtomEscape | CharacterClass | '(' Disjunction ')' | '(' '?' ':' Disjunction ')' )
- ( '*' | '+' | '?' | '{' DecimalDigits (',' DecimalDigits?)? '}' ) '?'?
- PatternCharacter ::= <unicode character> but not any of { '^', '$', '\', '.', '*', '+', '?', '(', ')', '[', ']', '{', '}', '|' }
- AtomEscape ::= DecimalEscape | CharacterEscape | CharacterClassEscape
- CharacterEscape ::= ControlEscape | 'c' ControlLetter | HexEscapeSequence | UnicodeEscapeSequence | IdentityEscape
- ControlEscape ::= one of { 'f', 'n', 'r', 't', 'v' }
- ControlLetter ::= one of { 'a'..'z', 'A'..'Z' }
- IdentityEscape ::= <unicode character> but not <unicode Uppercase letter>, <unicode Lowercase letter>, <unicode Titlecase letter>, <unicode Modifier letter>, <unicode Other letter>, <unicode Letter number>, '$', '_', <unicode Non-spacing mark>, <unicode combining spacing mark>, <unicode Decimal number>, <unicode Connector punctuation>
- DecimalEscape ::= DecimalIntegerLiteral [lookahead not in DecimalDigit]
- CharacterClassEscape :: one of { 'd', 'D', 's', 'S', 'w', 'W' }
- CharacterClass ::= '[' [lookahead not in {'^'}] ClassRanges ']' | '[' '^' ClassRanges ']'
- ClassRanges ::= [empty] | NonemptyClassRanges
- NonemptyClassRanges ::= ClassAtom | ClassAtom NonemptyClassRangesNoDash | ClassAtom '-' ClassAtom ClassRanges
- NonemptyClassRangesNoDash ::= ClassAtom | ClassAtomNoDash NonemptyClassRangesNoDash | ClassAtomNoDash '-' ClassAtom ClassRanges
- ClassAtom ::= '-' | ClassAtomNoDash
- ClassAtomNoDash ::= SourceCharacter but not one of { '\', ']', '-' } | '\' ClassEscape
- ClassEscape ::= DecimalEscape | 'b' | CharacterEscape | CharacterClassEscape
- HexEscapeSequence ::= 'x' HexDigit{2}
- UnicodeEscapeSequence ::= 'u' HexDigit{4} | 'u' '{' HexDigits{4-6} '}'
- HexDigit ::= one of { '0'..'9', 'a'..'f', 'A'..'F' }
- DecimalIntegerLiteral ::= '0' | NonZeroDigit DecimalDigits?
- DecimalDigits ::= DecimalDigit+
- DecimalDigit ::= one of { '0'..'9' }
- NonZeroDigit ::= one of { '1'..'9' }
- ------------------
- Annex B Deviations
- ------------------
- QuantifiableAssertion [added] ::= '(' '?' '=' Disjunction ')' | '(' '?' '!' Disjunction ')'
- Term ::= ... | '(' '?' '=' Disjunction ')' [removed] | '(' '?' '!' Disjunction ')' [removed] | QuantifiableAssertion Quantifier? [added]
- */
- #include "ParserPch.h"
- namespace UnifiedRegex
- {
- ParseError::ParseError(bool isBody, CharCount pos, CharCount encodedPos, HRESULT error)
- : isBody(isBody), pos(pos), encodedPos(encodedPos), error(error)
- {
- }
- template <typename P, const bool IsLiteral>
- Parser<P, IsLiteral>::Parser
- ( Js::ScriptContext* scriptContext
- , ArenaAllocator* ctAllocator
- , StandardChars<EncodedChar>* standardEncodedChars
- , StandardChars<Char>* standardChars
- , bool isFromExternalSource
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , DebugWriter* w
- #endif
- )
- : scriptContext(scriptContext)
- , ctAllocator(ctAllocator)
- , standardEncodedChars(standardEncodedChars)
- , standardChars(standardChars)
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , w(w)
- #endif
- , input(0)
- , inputLim(0)
- , next(0)
- , inBody(false)
- , numGroups(1)
- , nextGroupId(1) // implicit overall group always takes index 0
- , litbuf(0)
- , litbufLen(0)
- , litbufNext(0)
- , surrogatePairList(nullptr)
- , currentSurrogatePairNode(nullptr)
- , tempLocationOfSurrogatePair(nullptr)
- , tempLocationOfRange(nullptr)
- , codePointAtTempLocation(0)
- , unicodeFlagPresent(false)
- , caseInsensitiveFlagPresent(false)
- , positionAfterLastSurrogate(nullptr)
- , valueOfLastSurrogate(INVALID_CODEPOINT)
- , deferredIfNotUnicodeError(nullptr)
- , deferredIfUnicodeError(nullptr)
- {
- if (isFromExternalSource)
- FromExternalSource();
- }
- //
- // Input buffer management
- //
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::SetPosition(const EncodedChar* input, const EncodedChar* inputLim, bool inBody)
- {
- this->input = input;
- this->inputLim = inputLim;
- next = input;
- this->inBody = inBody;
- RestoreMultiUnits(0);
- }
- template <typename P, const bool IsLiteral>
- inline CharCount Parser<P, IsLiteral>::Pos()
- {
- CharCount nextOffset = Chars<EncodedChar>::OSB(next, input);
- Assert(nextOffset >= m_cMultiUnits);
- return nextOffset - (CharCount)m_cMultiUnits;
- }
- template <typename P, const bool IsLiteral>
- inline bool Parser<P, IsLiteral>::IsEOF()
- {
- return next >= inputLim;
- }
- template <typename P, const bool IsLiteral>
- inline bool Parser<P, IsLiteral>::ECCanConsume(CharCount n = 1)
- {
- return next + n <= inputLim;
- }
- template <typename P, const bool IsLiteral>
- inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookahead(CharCount n = 0)
- {
- // Ok to look ahead to terminating 0
- Assert(next + n <= inputLim);
- return next[n];
- }
- template <typename P, const bool IsLiteral>
- inline typename P::EncodedChar Parser<P, IsLiteral>::ECLookback(CharCount n = 0)
- {
- // Ok to look ahead to terminating 0
- Assert(n + input <= next);
- return *(next - n);
- }
- template <typename P, const bool IsLiteral>
- inline void Parser<P, IsLiteral>::ECConsume(CharCount n = 1)
- {
- Assert(next + n <= inputLim);
- #if DBG
- for (CharCount i = 0; i < n; i++)
- Assert(!IsMultiUnitChar(next[i]));
- #endif
- next += n;
- }
- template <typename P, const bool IsLiteral>
- inline void Parser<P, IsLiteral>::ECConsumeMultiUnit(CharCount n = 1)
- {
- Assert(next + n <= inputLim);
- next += n;
- }
- template <typename P, const bool IsLiteral>
- inline void Parser<P, IsLiteral>::ECRevert(CharCount n = 1)
- {
- Assert(n + input <= next);
- next -= n;
- }
- //
- // Helpers
- //
- template <typename P, const bool IsLiteral>
- int Parser<P, IsLiteral>::TryParseExtendedUnicodeEscape(Char& c, bool& previousSurrogatePart, bool trackSurrogatePair = false)
- {
- if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- return 0;
- }
- if (!ECCanConsume(2) || ECLookahead(0) !='{' || !standardEncodedChars->IsHex(ECLookahead(1)))
- {
- return 0;
- }
- // The first character is mandatory to consume escape sequence, so we check for it above, at this stage we can set it as we already checked.
- codepoint_t codePoint = standardEncodedChars->DigitValue(ECLookahead(1));
- int i = 2;
- while(ECCanConsume(i + 1) && standardEncodedChars->IsHex(ECLookahead(i)))
- {
- codePoint <<= 4;
- codePoint += standardEncodedChars->DigitValue(ECLookahead(i));
- if (codePoint > 0x10FFFF)
- {
- return 0;
- }
- i++;
- }
- if(!ECCanConsume(i + 1) || ECLookahead(i) != '}')
- {
- return 0;
- }
- uint consumptionNumber = i + 1;
- Assert(consumptionNumber >= 3);
- if (!previousSurrogatePart && trackSurrogatePair && this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- // Current location
- TrackIfSurrogatePair(codePoint, (next - 1), consumptionNumber + 1);
- }
- wchar_t other;
- // Generally if this code point is a single character, then we take it and return.
- // If the character is made up of two characters then we emit the first and backtrack to the start of th escape sequence;
- // Following that we check if we have already seen the first character, and if so emit the second and consume the entire escape sequence.
- if (codePoint < 0x10000)
- {
- c = UTC(codePoint);
- ECConsumeMultiUnit(consumptionNumber);
- }
- else if (previousSurrogatePart)
- {
- previousSurrogatePart = false;
- Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &other, &c);
- ECConsumeMultiUnit(consumptionNumber);
- }
- else
- {
- previousSurrogatePart = true;
- Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &c, &other);
- Assert(ECLookback(1) == 'u' && ECLookback(2) == '\\');
- ECRevert(2);
- }
- return consumptionNumber;
- }
- // This function has the following 'knowledge':
- // - A codepoint that might be part of a surrogate pair or part of one
- // - A location where that codepoint is located
- // - Previously tracked part of surrogate pair (tempLocationOfSurrogatePair, and codePointAtTempLocation), which is always a surrogate lower part.
- // - A pointer to the current location of the linked list
- // - If a previous location is tracked, then it is of a parsed character (not given character) before current, and not same to current
- // This can't be asserted directly, and has to be followed by callers. Term pass can reset with each iteration, as well as this method in cases it needs.
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>:: TrackIfSurrogatePair(codepoint_t codePoint, const EncodedChar* location, uint32 consumptionLength)
- {
- Assert(codePoint < 0x110000);
- Assert(location != nullptr);
- Assert(location != this->tempLocationOfSurrogatePair);
- if (Js::NumberUtilities::IsSurrogateLowerPart(codePoint))
- {
- this->tempLocationOfSurrogatePair = location;
- this->codePointAtTempLocation = codePoint;
- }
- else
- {
- if(Js::NumberUtilities::IsSurrogateUpperPart(codePoint) && this->tempLocationOfSurrogatePair != nullptr)
- {
- Assert(Js::NumberUtilities::IsSurrogateLowerPart(codePointAtTempLocation));
- consumptionLength = (uint32)(location - this->tempLocationOfSurrogatePair) + consumptionLength;
- codePoint = Js::NumberUtilities::SurrogatePairAsCodePoint(codePointAtTempLocation, codePoint);
- location = this->tempLocationOfSurrogatePair;
- }
- // At this point we can clear previous location, and then if codePoint is bigger than 0xFFFF store it, as we either received it or combined it above
- this->tempLocationOfSurrogatePair = nullptr;
- this->codePointAtTempLocation = 0;
- }
- if (codePoint > 0xFFFF)
- {
- this->positionAfterLastSurrogate = location + consumptionLength;
- this->valueOfLastSurrogate = codePoint;
- // When parsing without AST we aren't given an allocator. In addition, only the 2 lines above are used during Pass 0;
- // while the bottom is used during Pass 1 (which isn't done when ParseNoAST)
- if(this->ctAllocator != nullptr)
- {
- SurrogatePairTracker* node = Anew(this->ctAllocator, SurrogatePairTracker, location, this->tempLocationOfRange, codePoint, consumptionLength, m_cMultiUnits);
- if (surrogatePairList == nullptr)
- {
- Assert(currentSurrogatePairNode == nullptr);
- surrogatePairList = node;
- currentSurrogatePairNode = node;
- }
- else
- {
- Assert(currentSurrogatePairNode != nullptr);
- currentSurrogatePairNode->next = node;
- currentSurrogatePairNode = node;
- }
- }
- }
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::CreateSurrogatePairAtom(wchar_t lower, wchar_t upper)
- {
- MatchLiteralNode * literalNode = Anew(this->ctAllocator, MatchLiteralNode, 0, 0);
- MatchCharNode lowerNode(lower);
- MatchCharNode upperNode(upper);
- AccumLiteral(literalNode, &lowerNode);
- AccumLiteral(literalNode, &upperNode);
- return literalNode;
- }
- // This function will create appropriate pairs of ranges and add them to the disjunction node.
- // Terms used in comments:
- // - A minor codePoint is smaller than major codePoint, and both define a range of codePoints above 0x10000; to avoid confusion between lower/upper denoting codeUnits composing the surrogate pair.
- // - A boundary is a mod 0x400 alignment marker due to the nature of surrogate pairs representation. So the codepoint 0x10300 lies between boundaries 0x10000 and 0x10400.
- // - A prefix is the range set used to represent the values from minorCodePoint to the first boundary above minorCodePoint if applicable.
- // - A suffix is the range set used to represent the values from first boundary below majorCodePoint to the majorCodePoint if applicable.
- // - A full range is the range set used to represent the values from first boundary above minorCodePoint to first boundary below majorCodePoint if applicable.
- // The algorithm works as follows:
- // 1. Determine minorBoundary (minorCodePoint - mod 0x400 +0x400) and majorBoundary (majorCodePoint - mod 0x400). minorBoundary > minorCodePoint and majorBoundary < majorCodePoint
- // 2. Based on the codePoints and the boundaries, prefix, suffix, and full range is determined. Here are the rules:
- // 2-a. If minorBoundary > majorBoundary, we have an inner boundary range, output just that.
- // 2-b. If minorBoundary - 0x400u != minorCodepoint (i.e. codePoint doesn't lie right on a boundary to be part of a full range) we have a prefix.
- // 2-c. If majorBoundary + 0x3FFu != majorCodepoint (i.e. codePoint doesn't lie right before a boundary to be part of a full range) we have a suffix.
- // 2-d. We have a full range, if the two boundaries don't equal, OR the codePoints lie on the range boundaries opposite to what constitutes a prefix/suffix.
- // Visual representation for sample range 0x10300 - 0x10900
- // | [ _ | ^ ] |
- // 0x10000 0x10800 0x11000
- // [ ] - denote the actual range
- // _ - minorBoundary
- // ^ - majorBoundary
- // | - other boundaries
- // prefix is between [ and _
- // suffix is between ^ and ]
- // full range is between _ and ^
- template <typename P, const bool IsLiteral>
- AltNode* Parser<P, IsLiteral>::AppendSurrogateRangeToDisjunction(codepoint_t minorCodePoint, codepoint_t majorCodePoint, AltNode *lastAltNode)
- {
- Assert(minorCodePoint < majorCodePoint);
- Assert(minorCodePoint >= 0x10000u);
- Assert(majorCodePoint >= 0x10000u);
- Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && unicodeFlagPresent);
- wchar_t lowerMinorCodeUnit, upperMinorCodeUnit, lowerMajorCodeUnit, upperMajorCodeUnit;
- Js::NumberUtilities::CodePointAsSurrogatePair(minorCodePoint, &lowerMinorCodeUnit, &upperMinorCodeUnit);
- Js::NumberUtilities::CodePointAsSurrogatePair(majorCodePoint, &lowerMajorCodeUnit, &upperMajorCodeUnit);
- // These boundaries represent whole range boundaries, as in 0x10000, 0x10400, 0x10800 etc
- // minor boundary is the first boundary strictly above minorCodePoint
- // major boundary is the first boundary below or equal to majorCodePoint
- codepoint_t minorBoundary = minorCodePoint - (minorCodePoint % 0x400u) + 0x400u;
- codepoint_t majorBoundary = majorCodePoint - (majorCodePoint % 0x400u);
- Assert(minorBoundary >= 0x10000);
- Assert(majorBoundary >= 0x10000);
- AltNode* tailToAdd = nullptr;
- // If the minor boundary is higher than major boundary, that means we have a range within the boundary and is less than 0x400
- // Ex: 0x10430 - 0x10700 will have minor boundary of 0x10800 and major of 0x10400
- // This pair will be represented in single range set.
- const bool singleRange = minorBoundary > majorBoundary;
- if (singleRange)
- {
- Assert(majorCodePoint - minorCodePoint < 0x400u);
- Assert(lowerMinorCodeUnit == lowerMajorCodeUnit);
- MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, lowerMinorCodeUnit);
- MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
- setNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)upperMajorCodeUnit);
- ConcatNode* concatNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, setNode, nullptr));
- tailToAdd = Anew(ctAllocator, AltNode, concatNode, nullptr);
- }
- else
- {
- Node* prefixNode = nullptr, *suffixNode = nullptr;
- const bool twoConsecutiveRanges = minorBoundary == majorBoundary;
- // For minorBoundary,
- if (minorBoundary - minorCodePoint == 1) // Single character in minor range
- {
- // The prefix is only a surrogate pair atom
- prefixNode = CreateSurrogatePairAtom(lowerMinorCodeUnit, upperMinorCodeUnit);
- }
- else if (minorCodePoint != minorBoundary - 0x400u) // Minor range isn't full
- {
- Assert(minorBoundary - minorCodePoint < 0x400u);
- MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorCodeUnit);
- MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false);
- upperSetNode->set.SetRange(ctAllocator, (Char)upperMinorCodeUnit, (Char)0xDFFFu);
- prefixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
- }
- else // Full minor range
- {
- minorBoundary -= 0x400u;
- }
- if (majorBoundary == majorCodePoint) // Single character in major range
- {
- // The suffix is only a surrogate pair atom
- suffixNode = CreateSurrogatePairAtom(lowerMajorCodeUnit, upperMajorCodeUnit);
- majorBoundary -= 0x400u;
- }
- else if (majorBoundary + 0x3FFu != majorCodePoint) // Major range isn't full
- {
- Assert(majorCodePoint - majorBoundary < 0x3FFu);
- MatchCharNode* lowerCharNode = Anew(ctAllocator, MatchCharNode, (Char)lowerMajorCodeUnit);
- MatchSetNode* upperSetNode = Anew(ctAllocator, MatchSetNode, false, false);
- upperSetNode->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)upperMajorCodeUnit);
- suffixNode = Anew(ctAllocator, ConcatNode, lowerCharNode, Anew(ctAllocator, ConcatNode, upperSetNode, nullptr));
- majorBoundary -= 0x400u;
- }
- const bool nonFullConsecutiveRanges = twoConsecutiveRanges && prefixNode != nullptr && suffixNode != nullptr;
- if (nonFullConsecutiveRanges)
- {
- Assert(suffixNode != nullptr);
- Assert(minorCodePoint != minorBoundary - 0x400u);
- Assert(majorBoundary + 0x3FFu != majorCodePoint);
- // If the minor boundary is equal to major boundary, that means we have a cross boundary range that only needs 2 nodes for prefix/suffix.
- // We can only cross one boundary.
- Assert(majorCodePoint - minorCodePoint < 0x800u);
- tailToAdd = Anew(ctAllocator, AltNode, prefixNode, Anew(ctAllocator, AltNode, suffixNode, nullptr));
- }
- else
- {
- // We have 3 sets of ranges, comprising of prefix, full and suffix.
- Assert(majorCodePoint - minorCodePoint >= 0x400u);
- Assert((prefixNode != nullptr && suffixNode != nullptr) // Spanning more than two ranges
- || (prefixNode == nullptr && minorBoundary == minorCodePoint) // Two consecutive ranges and the minor is full
- || (suffixNode == nullptr && majorBoundary + 0x3FFu == majorCodePoint)); // Two consecutive ranges and the major is full
- Node* lowerOfFullRange;
- wchar_t lowerMinorBoundary, lowerMajorBoundary, ignore;
- Js::NumberUtilities::CodePointAsSurrogatePair(minorBoundary, &lowerMinorBoundary, &ignore);
- bool singleFullRange = majorBoundary == minorBoundary;
- if (singleFullRange)
- {
- // The lower part of the full range is simple a surrogate lower char
- lowerOfFullRange = Anew(ctAllocator, MatchCharNode, (Char)lowerMinorBoundary);
- }
- else
- {
- Js::NumberUtilities::CodePointAsSurrogatePair(majorBoundary, &lowerMajorBoundary, &ignore);
- MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
- setNode->set.SetRange(ctAllocator, (Char)lowerMinorBoundary, (Char)lowerMajorBoundary);
- lowerOfFullRange = setNode;
- }
- MatchSetNode* fullUpperRange = Anew(ctAllocator, MatchSetNode, false, false);
- fullUpperRange->set.SetRange(ctAllocator, (Char)0xDC00u, (Char)0xDFFFu);
- // These are added in the following order [full] [prefix][suffix]
- // This is doing by prepending, so in reverse.
- if (suffixNode != nullptr)
- {
- tailToAdd = Anew(ctAllocator, AltNode, suffixNode, tailToAdd);
- }
- if (prefixNode != nullptr)
- {
- tailToAdd = Anew(ctAllocator, AltNode, prefixNode, tailToAdd);
- }
- tailToAdd = Anew(ctAllocator, AltNode, Anew(ctAllocator, ConcatNode, lowerOfFullRange, Anew(ctAllocator, ConcatNode, fullUpperRange, nullptr)), tailToAdd);
- }
- }
- if (lastAltNode != nullptr)
- {
- Assert(lastAltNode->tail == nullptr);
- lastAltNode->tail = tailToAdd;
- }
- return tailToAdd;
- }
- template <typename P, const bool IsLiteral>
- AltNode* Parser<P, IsLiteral>::AppendSurrogatePairToDisjunction(codepoint_t codePoint, AltNode *lastAltNode)
- {
- wchar_t lower, upper;
- Js::NumberUtilities::CodePointAsSurrogatePair(codePoint, &lower, &upper);
- AltNode* tailNode = Anew(ctAllocator, AltNode, CreateSurrogatePairAtom(lower, upper), nullptr);
- if (lastAltNode != nullptr)
- {
- lastAltNode->tail = tailNode;
- }
- return tailNode;
- }
- //
- // Errors
- //
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::Fail(HRESULT error)
- {
- throw ParseError(inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
- }
- // This doesn't throw, but stores first error code for throwing later
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::DeferredFailIfUnicode(HRESULT error)
- {
- Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
- if (this->deferredIfUnicodeError == nullptr)
- {
- this->deferredIfUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
- }
- }
- // This doesn't throw, but stores first error code for throwing later
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::DeferredFailIfNotUnicode(HRESULT error)
- {
- Assert(this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
- if (this->deferredIfNotUnicodeError == nullptr)
- {
- this->deferredIfNotUnicodeError = Anew(ctAllocator, ParseError, inBody, Pos(), Chars<EncodedChar>::OSB(next, input), error);
- }
- }
- template <typename P, const bool IsLiteral>
- inline void Parser<P, IsLiteral>::ECMust(EncodedChar ec, HRESULT error)
- {
- // We never look for 0
- Assert(ec != 0);
- if (ECLookahead() != ec)
- Fail(error);
- ECConsume();
- }
- template <typename P, const bool IsLiteral>
- inline wchar_t Parser<P, IsLiteral>::NextChar()
- {
- Assert(!IsEOF());
- // Could be an embedded 0
- Char c = ReadFull<true>(next, inputLim);
- // No embedded newlines in literals
- if (IsLiteral && standardChars->IsNewline(c))
- Fail(ERRnoSlash);
- return c;
- }
- //
- // Patterns/Disjunctions/Alternatives
- //
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::PatternPass0()
- {
- this->positionAfterLastSurrogate = nullptr;
- this->deferredIfNotUnicodeError = nullptr;
- this->deferredIfUnicodeError = nullptr;
- DisjunctionPass0(0);
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::PatternPass1()
- {
- return DisjunctionPass1();
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::UnionNodes(Node* prev, Node* curr)
- {
- if (prev->tag == Node::MatchChar)
- {
- MatchCharNode* prevChar = (MatchCharNode*)prev;
- if (curr->tag == Node::MatchChar)
- {
- MatchCharNode* currChar = (MatchCharNode*)curr;
- if (prevChar->cs[0] == currChar->cs[0])
- // Just ignore current node
- return prevChar;
- else
- {
- // Union chars into new set
- MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
- setNode->set.Set(ctAllocator, prevChar->cs[0]);
- setNode->set.Set(ctAllocator, currChar->cs[0]);
- return setNode;
- }
- }
- else if (curr->tag == Node::MatchSet)
- {
- MatchSetNode* currSet = (MatchSetNode*)curr;
- if (currSet->isNegation)
- // Can't merge
- return 0;
- else
- {
- // Union chars into new set
- MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false);
- setNode->set.Set(ctAllocator, prevChar->cs[0]) ;
- setNode->set.UnionInPlace(ctAllocator, currSet->set);
- return setNode;
- }
- }
- else
- // Can't merge
- return 0;
- }
- else if (prev->tag == Node::MatchSet)
- {
- MatchSetNode* prevSet = (MatchSetNode*)prev;
- if (prevSet->isNegation)
- // Can't merge
- return 0;
- else if (curr->tag == Node::MatchChar)
- {
- MatchCharNode* currChar = (MatchCharNode*)curr;
- // Include char in prev set
- prevSet->set.Set(ctAllocator, currChar->cs[0]);
- return prevSet;
- }
- else if (curr->tag == Node::MatchSet)
- {
- MatchSetNode* currSet = (MatchSetNode*)curr;
- if (currSet->isNegation)
- // Can't merge
- return 0;
- else
- {
- // Include chars in prev set
- prevSet->set.UnionInPlace(ctAllocator, currSet->set);
- return prevSet;
- }
- }
- else
- // Can't merge
- return 0;
- }
- else
- // Can't merge
- return 0;
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::DisjunctionPass0(int depth)
- {
- AlternativePass0(depth);
- while (true)
- {
- // Could be terminating 0
- if (ECLookahead() != '|')
- return;
- ECConsume();
- AlternativePass0(depth);
- }
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::DisjunctionPass1()
- {
- // Maintain the invariants:
- // - alt lists have two or more items
- // - alt list items are never alt lists (so we must inline them)
- // - an alt list never contains two consecutive match-character/match-set nodes
- // (so we must union consecutive items into a single set)
- Node* node = AlternativePass1();
- AltNode* last = 0;
- // First node may be an alternative
- if (node->tag == Node::Alt)
- {
- last = (AltNode*)node;
- while (last->tail != 0)
- last = last->tail;
- }
- while (true)
- {
- // Could be terminating 0
- if (ECLookahead() != '|')
- return node;
- ECConsume(); // '|'
- Node* next = AlternativePass1();
- AnalysisAssert(next != nullptr);
- Node* revisedPrev = UnionNodes(last == 0 ? node : last->head, next);
- if (revisedPrev != 0)
- {
- // Can merge next into previously seen alternative
- if (last == 0)
- node = revisedPrev;
- else
- last->head = revisedPrev;
- }
- else if (next->tag == Node::Alt)
- {
- AltNode* nextList = (AltNode*)next;
- // Append inner list to current list
- revisedPrev = UnionNodes(last == 0 ? node : last->head, nextList->head);
- if (revisedPrev != 0)
- {
- // Can merge head of list into previously seen alternative
- if (last ==0)
- node = revisedPrev;
- else
- last->head = revisedPrev;
- nextList = nextList->tail;
- }
- AnalysisAssert(nextList != nullptr);
- if (last == 0)
- node = Anew(ctAllocator, AltNode, node, nextList);
- else
- last->tail = nextList;
- while (nextList->tail != 0)
- nextList = nextList->tail;
- last = nextList;
- }
- else
- {
- // Append node
- AltNode* cons = Anew(ctAllocator, AltNode, next, 0);
- if (last == 0)
- node = Anew(ctAllocator, AltNode, node, cons);
- else
- last->tail = cons;
- last = cons;
- }
- }
- }
- template <typename P, const bool IsLiteral>
- inline bool Parser<P, IsLiteral>::IsEndOfAlternative()
- {
- EncodedChar ec = ECLookahead();
- // Could be terminating 0, but embedded 0 is part of alternative
- return (ec == 0 && IsEOF()) || ec == ')' || ec == '|' || (IsLiteral && ec == '/');
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::EnsureLitbuf(CharCount size)
- {
- if (litbufLen - litbufNext < size)
- {
- CharCount newLen = max(litbufLen, initLitbufSize);
- while (newLen < litbufNext + size)
- newLen *= 2;
- litbuf = (Char*)ctAllocator->Realloc(litbuf, litbufLen * sizeof(Char), newLen * sizeof(Char));
- litbufLen = newLen;
- }
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::AccumLiteral(MatchLiteralNode* deferredLiteralNode, Node* charOrLiteralNode)
- {
- Assert(charOrLiteralNode->tag == Node::MatchChar || charOrLiteralNode->tag == Node::MatchLiteral);
- CharCount addLen = charOrLiteralNode->LiteralLength();
- Assert(addLen > 0);
- if (deferredLiteralNode->length == 0)
- {
- // Start a new literal
- EnsureLitbuf(addLen);
- deferredLiteralNode->offset = litbufNext;
- deferredLiteralNode->length = addLen;
- charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
- }
- else if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
- {
- // Keep growing the current literal
- EnsureLitbuf(addLen);
- charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
- deferredLiteralNode->length += addLen;
- }
- else if (charOrLiteralNode->tag == Node::MatchLiteral && deferredLiteralNode->offset + deferredLiteralNode->length == ((MatchLiteralNode*)charOrLiteralNode)->offset)
- {
- // Absorb next literal into current literal since they are adjacent
- deferredLiteralNode->length += addLen;
- }
- else
- {
- // Abandon current literal and start a fresh one (leaves gap)
- EnsureLitbuf(deferredLiteralNode->length + addLen);
- js_wmemcpy_s(litbuf + litbufNext, litbufLen - litbufNext, litbuf + deferredLiteralNode->offset, deferredLiteralNode->length);
- deferredLiteralNode->offset = litbufNext;
- litbufNext += deferredLiteralNode->length;
- charOrLiteralNode->AppendLiteral(litbufNext, litbufLen, litbuf);
- deferredLiteralNode->length += addLen;
- }
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::FinalTerm(Node* node, MatchLiteralNode* deferredLiteralNode)
- {
- if (node == deferredLiteralNode)
- {
- #if DBG
- if (deferredLiteralNode->length == 0)
- Assert(false);
- #endif
- Assert(deferredLiteralNode->offset < litbufNext);
- Assert(deferredLiteralNode->offset + deferredLiteralNode->length <= litbufNext);
- if (deferredLiteralNode->length == 1)
- {
- node = Anew(ctAllocator, MatchCharNode, litbuf[deferredLiteralNode->offset]);
- if (deferredLiteralNode->offset + deferredLiteralNode->length == litbufNext)
- // Reclaim last added character
- litbufNext--;
- // else: leave a gap in the literal buffer
- }
- else
- node = Anew(ctAllocator, MatchLiteralNode, *deferredLiteralNode);
- deferredLiteralNode->offset = 0;
- deferredLiteralNode->length = 0;
- }
- return node;
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::AlternativePass0(int depth)
- {
- while (!IsEndOfAlternative())
- TermPass0(depth);
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::AlternativePass1()
- {
- if (IsEndOfAlternative())
- return Anew(ctAllocator, SimpleNode, Node::Empty);
- MatchCharNode deferredCharNode(0);
- MatchLiteralNode deferredLiteralNode(0, 0);
- // Maintain the invariants:
- // - concat lists have two or more items
- // - concat list items are never concat lists
- // - a concat list never contains two consecutive match-character/match-literal nodes
- bool previousSurrogatePart = false;
- Node* node = TermPass1(&deferredCharNode, previousSurrogatePart);
- AnalysisAssert(node != nullptr);
- ConcatNode* last = 0;
- // First node may be a concat
- if (node->tag == Node::Concat)
- {
- last = (ConcatNode*)node;
- while (last->tail != 0)
- last = last->tail;
- }
- if (last == 0)
- {
- if (node->LiteralLength() > 0)
- {
- // Begin a new literal
- AccumLiteral(&deferredLiteralNode, node);
- node = &deferredLiteralNode;
- }
- }
- else
- {
- if (last->head->LiteralLength() > 0)
- {
- // Begin a new literal
- AccumLiteral(&deferredLiteralNode, last->head);
- last->head = &deferredLiteralNode;
- }
- }
- while (!IsEndOfAlternative())
- {
- Node* next = TermPass1(&deferredCharNode, previousSurrogatePart);
- AnalysisAssert(next != nullptr);
- if (next->LiteralLength() > 0)
- {
- // Begin a new literal or grow the existing literal
- AccumLiteral(&deferredLiteralNode, next);
- if (last == 0)
- {
- if (node != &deferredLiteralNode)
- {
- // So far we have first item and the current literal
- ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
- node = Anew(ctAllocator, ConcatNode, node, cons);
- last = cons;
- }
- // else: keep growing first literal
- }
- else
- {
- if (last->head != &deferredLiteralNode)
- {
- // Append a new literal node
- ConcatNode* cons = Anew(ctAllocator, ConcatNode, &deferredLiteralNode, 0);
- last->tail = cons;
- last = cons;
- }
- // else: keep growing current literal
- }
- }
- else if (next->tag == Node::Concat)
- {
- // Append this list to accumulated list
- ConcatNode* nextList = (ConcatNode*)next;
- if (nextList->head->LiteralLength() > 0 &&
- ((last == 0 && node == &deferredLiteralNode) ||
- (last != 0 && last->head == &deferredLiteralNode)))
- {
- // Absorb the next character or literal into the current literal
- // (may leave a gab in litbuf)
- AccumLiteral(&deferredLiteralNode, nextList->head);
- nextList = nextList->tail;
- // List has at least two items
- AnalysisAssert(nextList != 0);
- // List should be in canonical form, so no consecutive chars/literals
- Assert(nextList->head->LiteralLength() == 0);
- }
- if (last == 0)
- node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), nextList);
- else
- {
- last->head = FinalTerm(last->head, &deferredLiteralNode);
- last->tail = nextList;
- }
- while (nextList->tail != 0)
- nextList = nextList->tail;
- last = nextList;
- // No outstanding literals
- Assert(deferredLiteralNode.length == 0);
- if (last->head->LiteralLength() > 0)
- {
- // If the list ends with a literal, transfer it into deferredLiteralNode
- // so we can continue accumulating (won't leave a gab in litbuf)
- AccumLiteral(&deferredLiteralNode, last->head);
- // Can discard MatchLiteralNode since it lives in compile-time allocator
- last->head = &deferredLiteralNode;
- }
- }
- else
- {
- // Append this node to accumulated list
- ConcatNode* cons = Anew(ctAllocator, ConcatNode, next, 0);
- if (last == 0)
- node = Anew(ctAllocator, ConcatNode, FinalTerm(node, &deferredLiteralNode), cons);
- else
- {
- last->head = FinalTerm(last->head, &deferredLiteralNode);
- last->tail = cons;
- }
- last = cons;
- // No outstanding literals
- Assert(deferredLiteralNode.length == 0);
- }
- }
- if (last == 0)
- node = FinalTerm(node, &deferredLiteralNode);
- else
- last->head = FinalTerm(last->head, &deferredLiteralNode);
- // No outstanding literals
- Assert(deferredLiteralNode.length == 0);
- return node;
- }
- //
- // Terms
- //
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::NewLoopNode(CharCount lower, CharCountOrFlag upper, bool isGreedy, Node* body)
- {
- //
- // NOTE: We'd like to represent r? (i.e. r{0,1}) as r|<empty> since the loop representation has high overhead.
- // HOWEVER if r contains a group definition and could match empty then we must execute as a loop
- // so that group bindings are correctly reset on no progress (e.g.: /(a*)?/.exec("")). Thus we defer
- // this optimization until pass 4 of the optimizer, at which point we know whether r could match empty.
- //
- if (lower == 1 && upper == 1)
- return body;
- else if (lower == 0 && upper == 0)
- // Loop is equivalent to empty. If the loop body contains group definitions they will have already been
- // counted towards the overall number of groups. The matcher will initialize their contents to
- // undefined, and since the loop body would never execute the inner groups could never be updated from
- // undefined.
- return Anew(ctAllocator, SimpleNode, Node::Empty);
- else
- return Anew(ctAllocator, LoopNode, lower, upper, isGreedy, body);
- }
- template <typename P, const bool IsLiteral>
- bool Parser<P, IsLiteral>::AtQuantifier()
- {
- // Could be terminating 0
- switch (ECLookahead())
- {
- case '*':
- case '+':
- case '?':
- return true;
- case '{':
- {
- CharCount lookahead = 1;
- while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
- lookahead++;
- if (lookahead == 1 || !ECCanConsume(lookahead + 1))
- return false;
- switch (ECLookahead(lookahead))
- {
- case ',':
- lookahead++;
- if (ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}')
- return true;
- else
- {
- CharCount saved = lookahead;
- while (ECCanConsume(lookahead + 1) && standardEncodedChars->IsDigit(ECLookahead(lookahead)))
- lookahead++;
- if (lookahead == saved)
- return false;
- return ECCanConsume(lookahead + 1) && ECLookahead(lookahead) == '}';
- }
- case '}':
- return true;
- default:
- return false;
- }
- }
- default:
- return false;
- }
- }
- template <typename P, const bool IsLiteral>
- bool Parser<P, IsLiteral>::OptNonGreedy()
- {
- // Could be terminating 0
- if (ECLookahead() != '?')
- return true;
- ECConsume();
- return false;
- }
- template <typename P, const bool IsLiteral>
- CharCount Parser<P, IsLiteral>::RepeatCount()
- {
- CharCount n = 0;
- int digits = 0;
- while (true)
- {
- // Could be terminating 0
- EncodedChar ec = ECLookahead();
- if (!standardEncodedChars->IsDigit(ec))
- {
- if (digits == 0)
- Fail(JSERR_RegExpSyntax);
- return n;
- }
- if (n > MaxCharCount / 10)
- Fail(JSERR_RegExpSyntax);
- n *= 10;
- if (n > MaxCharCount - standardEncodedChars->DigitValue(ec))
- Fail(JSERR_RegExpSyntax);
- n += standardEncodedChars->DigitValue(ec);
- digits++;
- ECConsume();
- }
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::TermPass0(int depth)
- {
- PROBE_STACK(scriptContext, Js::Constants::MinStackRegex);
- // Either we have a location at the start, or the end, never both. As in between it should have been cleared if surrogate pair
- // Or must be cleared if we didn't perform the check
- bool clearLocationIfPresent = this->tempLocationOfSurrogatePair != nullptr;
- switch (ECLookahead())
- {
- case '^':
- case '$':
- ECConsume();
- return;
- case '\\':
- ECConsume();
- if (AtomEscapePass0())
- return;
- break;
- case '(':
- // Can't combine into a single codeunit because of group present
- this->tempLocationOfSurrogatePair = nullptr;
- this->codePointAtTempLocation = 0;
- clearLocationIfPresent = false;
- ECConsume();
- switch (ECLookahead())
- {
- case '?':
- if (!ECCanConsume(2))
- Fail(JSERR_RegExpSyntax);
- switch (ECLookahead(1))
- {
- case '=':
- case '!':
- case ':':
- ECConsume(2);
- break;
- default:
- numGroups++;
- break;
- }
- break;
- default:
- numGroups++;
- break;
- }
- DisjunctionPass0(depth + 1);
- ECMust(')', JSERR_RegExpNoParen);
- break;
- case '.':
- ECConsume();
- break;
- case '[':
- ECConsume();
- this->tempLocationOfSurrogatePair = nullptr;
- this->codePointAtTempLocation = 0;
- this->tempLocationOfRange = next;
- CharacterClassPass0();
- this->tempLocationOfRange = nullptr;
- ECMust(']', JSERR_RegExpNoBracket);
- break;
- case ')':
- case '|':
- Fail(JSERR_RegExpSyntax);
- break;
- case ']':
- case '}':
- NextChar();
- break;
- case '*':
- case '+':
- case '?':
- case '{':
- if (AtQuantifier())
- Fail(JSERR_RegExpBadQuant);
- else
- ECConsume();
- break;
- case 0:
- if (IsEOF())
- // Terminating 0
- Fail(JSERR_RegExpSyntax);
- // else fall-through for embedded 0
- default:
- {
- const EncodedChar* current = next;
- Char c = NextChar();
- if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- TrackIfSurrogatePair(c, current, (uint32)(next - current));
- }
- // Closing '/' in literals should be caught explicitly
- Assert(!IsLiteral || c != '/');
- }
- break;
- }
- if (clearLocationIfPresent && this->tempLocationOfSurrogatePair != nullptr)
- {
- this->tempLocationOfSurrogatePair = nullptr;
- this->codePointAtTempLocation = 0;
- }
- if (AtQuantifier())
- {
- switch (ECLookahead())
- {
- case '*':
- case '+':
- case '?':
- ECConsume();
- OptNonGreedy();
- break;
- case '{':
- {
- ECConsume();
- CharCount lower = RepeatCount();
- switch (ECLookahead())
- {
- case ',':
- ECConsume();
- if (ECLookahead() == '}')
- {
- ECConsume();
- OptNonGreedy();
- }
- else
- {
- CharCount upper = RepeatCount();
- if (upper < lower)
- Fail(JSERR_RegExpSyntax);
- Assert(ECLookahead() == '}');
- ECConsume();
- OptNonGreedy();
- }
- break;
- case '}':
- ECConsume();
- OptNonGreedy();
- break;
- default:
- Assert(false);
- break;
- }
- break;
- }
- default:
- Assert(false);
- break;
- }
- }
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::TermPass1(MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
- {
- PROBE_STACK(scriptContext, Js::Constants::MinStackRegex);
- Node* node = 0;
- bool containsSurrogatePair = false;
- switch (ECLookahead())
- {
- case '^':
- ECConsume();
- return Anew(ctAllocator, SimpleNode, Node::BOL); // No quantifier allowed
- case '$':
- ECConsume();
- return Anew(ctAllocator, SimpleNode, Node::EOL); // No quantifier allowed
- case '\\':
- ECConsume();
- if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
- {
- break; // For quantifier
- }
- else if (AtomEscapePass1(node, deferredCharNode, previousSurrogatePart))
- {
- return node; // No quantifier allowed
- }
- break; // else: fall-through for opt quantifier
- case '(':
- ECConsume();
- switch (ECLookahead())
- {
- case '?':
- switch (ECLookahead(1))
- {
- case '=':
- ECConsume(2); // ?=
- node = DisjunctionPass1();
- Assert(ECLookahead() == ')');
- ECConsume(); // )
- node = Anew(ctAllocator, AssertionNode, false, node);
- break; // As per Annex B, allow this to be quantifiable
- case '!':
- ECConsume(2); // ?!
- node = DisjunctionPass1();
- Assert(ECLookahead() == ')');
- ECConsume(); // )
- node = Anew(ctAllocator, AssertionNode, true, node);
- break; // As per Annex B, allow this to be quantifiable
- case ':':
- ECConsume(2); // ?:
- node = DisjunctionPass1();
- Assert(ECLookahead() == ')');
- ECConsume(); // )
- break; // fall-through for opt quantifier
- default:
- {
- // ? not yet consumed
- int thisGroupId = nextGroupId++;
- node = DisjunctionPass1();
- Assert(ECLookahead() == ')');
- ECConsume(); // )
- node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
- break; // fall-through for opt quantifier
- }
- }
- break;
- default:
- {
- // next char not yet consumed
- int thisGroupId = nextGroupId++;
- node = DisjunctionPass1();
- Assert(ECLookahead() == ')');
- ECConsume(); // )
- node = Anew(ctAllocator, DefineGroupNode, thisGroupId, node);
- break; // fall-through for opt quantifier
- }
- }
- break;
- case '.':
- {
- ECConsume();
- node = GetNodeWithValidCharacterSet('.');
- break; // fall-through for opt quantifier
- }
- case '[':
- ECConsume();
- if (unicodeFlagPresent)
- {
- containsSurrogatePair = this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->rangeLocation == next;
- }
- node = containsSurrogatePair ? CharacterClassPass1<true>() : CharacterClassPass1<false>();
- Assert(ECLookahead() == ']');
- ECConsume(); // ]
- break; // fall-through for opt quantifier
- #if DBG
- case ')':
- case '|':
- Assert(false);
- break;
- #endif
- case ']':
- // SPEC DEVIATION: This should be syntax error, instead accept as itself
- deferredCharNode->cs[0] = NextChar();
- node = deferredCharNode;
- break; // fall-through for opt quantifier
- #if DBG
- case '*':
- case '+':
- case '?':
- AssertMsg(false, "Allowed only in the escaped form. These should be caught by TermPass0.");
- break;
- #endif
- case 0:
- if (IsEOF())
- // Terminating 0
- Fail(JSERR_RegExpSyntax);
- // else fall-through for embedded 0
- default:
- if(SurrogatePairPass1(node, deferredCharNode, previousSurrogatePart))
- {
- break; //For quantifier
- }
- else
- {
- deferredCharNode->cs[0] = NextChar();
- node = deferredCharNode;
- break; // fall-through for opt quantifier
- }
- }
- Assert(node != 0);
- if (AtQuantifier())
- {
- switch (ECLookahead())
- {
- case '*':
- if (node == deferredCharNode)
- node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
- ECConsume();
- return NewLoopNode(0, CharCountFlag, OptNonGreedy(), node);
- case '+':
- if (node == deferredCharNode)
- node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
- ECConsume();
- return NewLoopNode(1, CharCountFlag, OptNonGreedy(), node);
- case '?':
- if (node == deferredCharNode)
- node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
- ECConsume();
- return NewLoopNode(0, 1, OptNonGreedy(), node);
- case '{':
- {
- if (node == deferredCharNode)
- node = Anew(ctAllocator, MatchCharNode, *deferredCharNode);
- ECConsume();
- CharCount lower = RepeatCount();
- switch (ECLookahead())
- {
- case ',':
- ECConsume();
- if (ECLookahead() == '}')
- {
- ECConsume();
- return NewLoopNode(lower, CharCountFlag, OptNonGreedy(), node);
- }
- else
- {
- CharCount upper = RepeatCount();
- Assert(lower <= upper);
- Assert(ECLookahead() == '}');
- ECConsume(); // }
- return NewLoopNode(lower, upper, OptNonGreedy(), node);
- }
- case '}':
- ECConsume();
- return NewLoopNode(lower, lower, OptNonGreedy(), node);
- default:
- Assert(false);
- break;
- }
- break;
- }
- default:
- Assert(false);
- break;
- }
- }
- return node;
- }
- #pragma warning(push)
- #pragma warning(disable:4702) // unreachable code
- template <typename P, const bool IsLiteral>
- bool Parser<P, IsLiteral>::AtomEscapePass0()
- {
- EncodedChar ec = ECLookahead();
- if (ec == 0 && IsEOF())
- {
- // Terminating 0
- Fail(JSERR_RegExpSyntax);
- return false;
- }
- else if (standardEncodedChars->IsDigit(ec))
- {
- do
- {
- ECConsume();
- }
- while (standardEncodedChars->IsDigit(ECLookahead())); // terminating 0 is not a digit
- return false;
- }
- else if (ECLookahead() == 'c')
- {
- if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
- ECConsume(2);
- return false;
- }
- else
- {
- const EncodedChar *current = next;
- // An escaped '/' is ok
- Char c = NextChar();
- switch (c)
- {
- case 'b':
- case 'B':
- return true;
- // case 'c': handled as special case above
- case 'x':
- if (ECCanConsume(2) &&
- standardEncodedChars->IsHex(ECLookahead(0)) &&
- standardEncodedChars->IsHex(ECLookahead(1)))
- ECConsume(2);
- break;
- case 'u':
- bool surrogateEncountered = false;
- int lengthOfSurrogate = TryParseExtendedUnicodeEscape(c, surrogateEncountered, true);
- if (lengthOfSurrogate > 0)
- {
- if (surrogateEncountered)
- {
- // If we don't have an allocator, we don't create nodes
- // Asserts in place as extra checks for when we do have an allocator
- Assert(this->ctAllocator == nullptr || this->currentSurrogatePairNode != nullptr);
- Assert(this->ctAllocator == nullptr || current == this->currentSurrogatePairNode->location);
- ECConsume(lengthOfSurrogate);
- }
- //Don't fall through
- break;
- }
- else if (ECCanConsume(4) &&
- standardEncodedChars->IsHex(ECLookahead(0)) &&
- standardEncodedChars->IsHex(ECLookahead(1)) &&
- standardEncodedChars->IsHex(ECLookahead(2)) &&
- standardEncodedChars->IsHex(ECLookahead(3)))
- {
- if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- codepoint_t value = (standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
- (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
- (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
- (standardEncodedChars->DigitValue(ECLookahead(3)));
- TrackIfSurrogatePair(value, (next - 1), 5);
- }
- ECConsume(4);
- }
- break;
- }
- // embedded 0 is ok
- return false;
- }
- }
- #pragma warning(pop)
- template <typename P, const bool IsLiteral>
- bool Parser<P, IsLiteral>::AtomEscapePass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
- {
- Assert(!IsEOF()); // checked for terminating 0 in pass 0
- if (standardEncodedChars->IsDigit(ECLookahead()))
- {
- // As per Annex B, allow octal escapes as well as group references, disambiguate based on known
- // number of groups.
- if (ECLookahead() == '0')
- {
- // fall through for octal
- }
- else
- {
- // Could be a group reference, but only if between 1 and 5 digits which resolve to a valid group number
- int n = 0;
- CharCount digits = 0;
- do
- {
- n = n * 10 + (int)standardEncodedChars->DigitValue(ECLookahead(digits));
- digits++;
- }
- while (digits < 5 && ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits)));
- if (n >= numGroups || ECCanConsume(digits + 1) && standardEncodedChars->IsDigit(ECLookahead(digits)))
- {
- if (standardEncodedChars->IsOctal(ECLookahead()))
- {
- // fall through for octal
- }
- else
- {
- // \8 and \9 are identity escapes
- deferredCharNode->cs[0] = Chars<EncodedChar>::CTW(ECLookahead());
- ECConsume();
- node = deferredCharNode;
- return false; // not an assertion
- }
- }
- else
- {
- ECConsume(digits);
- node = Anew(ctAllocator, MatchGroupNode, n);
- return false; // not an assertion
- }
- }
- // Must be between 1 and 3 octal digits
- Assert(standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
- uint n = 0;
- CharCount digits = 0;
- do
- {
- uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
- if (m > Chars<uint8>::MaxUChar) // Regex octal codes only support single byte (ASCII) characters.
- break;
- n = m;
- ECConsume();
- digits++;
- }
- while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not an octal
- deferredCharNode->cs[0] = UTC((UChar)n);
- node = deferredCharNode;
- return false; // not an assertion
- }
- else if (ECLookahead() == 'c')
- {
- Char c;
- if (standardEncodedChars->IsLetter(ECLookahead(1))) // terminating 0 is not a letter
- {
- c = UTC(Chars<EncodedChar>::CTU(ECLookahead(1)) % 32);
- ECConsume(2);
- }
- else
- {
- // SPEC DEVIATION: For non-letters or EOF, take the leading '\' to be itself, and
- // don't consume the 'c' or letter.
- c = '\\';
- }
- deferredCharNode->cs[0] = c;
- node = deferredCharNode;
- return false; // not an assertion
- }
- else
- {
- Char c = NextChar();
- switch (c)
- {
- case 'b':
- node = Anew(ctAllocator, WordBoundaryNode, false);
- return true; // Is an assertion
- case 'B':
- node = Anew(ctAllocator, WordBoundaryNode, true);
- return true; // Is an assertion
- case 'f':
- c = '\f';
- break; // fall-through for identity escape
- case 'n':
- c = '\n';
- break; // fall-through for identity escape
- case 'r':
- c = '\r';
- break; // fall-through for identity escape
- case 't':
- c = '\t';
- break; // fall-through for identity escape
- case 'v':
- c = '\v';
- break; // fall-through for identity escape
- case 'd':
- {
- MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
- standardChars->SetDigits(ctAllocator, setNode->set);
- node = setNode;
- return false; // not an assertion
- }
- case 'D':
- {
- node = GetNodeWithValidCharacterSet('D');
- return false; // not an assertion
- }
- case 's':
- {
- MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
- standardChars->SetWhitespace(ctAllocator, setNode->set);
- node = setNode;
- return false; // not an assertion
- }
- case 'S':
- {
- node = GetNodeWithValidCharacterSet('S');
- return false; // not an assertion
- }
- case 'w':
- {
- MatchSetNode *setNode = Anew(ctAllocator, MatchSetNode, false, false);
- standardChars->SetWordChars(ctAllocator, setNode->set);
- node = setNode;
- return false; // not an assertion
- }
- case 'W':
- {
- node = GetNodeWithValidCharacterSet('W');
- return false; // not an assertion
- }
- // case 'c': handled as special case above
- case 'x':
- if (ECCanConsume(2) &&
- standardEncodedChars->IsHex(ECLookahead(0)) &&
- standardEncodedChars->IsHex(ECLookahead(1)))
- {
- c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
- (standardEncodedChars->DigitValue(ECLookahead(1))));
- ECConsume(2);
- // fall-through for identity escape
- }
- // Take to be identity escape if ill-formed as per Annex B
- break;
- case 'u':
- if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
- break;
- else if (ECCanConsume(4) &&
- standardEncodedChars->IsHex(ECLookahead(0)) &&
- standardEncodedChars->IsHex(ECLookahead(1)) &&
- standardEncodedChars->IsHex(ECLookahead(2)) &&
- standardEncodedChars->IsHex(ECLookahead(3)))
- {
- c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
- (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
- (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
- (standardEncodedChars->DigitValue(ECLookahead(3))));
- ECConsume(4);
- // fall-through for identity escape
- }
- // Take to be identity escape if ill-formed as per Annex B
- break;
- default:
- // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok
- break;
- }
- // Must be an identity escape
- deferredCharNode->cs[0] = c;
- node = deferredCharNode;
- return false; // not an assertion
- }
- }
- template <typename P, const bool IsLiteral>
- bool Parser<P, IsLiteral>::SurrogatePairPass1(Node*& node, MatchCharNode* deferredCharNode, bool& previousSurrogatePart)
- {
- if (!this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() || !unicodeFlagPresent)
- {
- return false;
- }
- if (this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
- {
- AssertMsg(!this->currentSurrogatePairNode->IsInsideRange(), "Should not be calling this pass if we are currently inside a range.");
- wchar_t lower, upper;
- uint tableIndex = 0, actualHigh = 0;
- codepoint_t equivClass[CaseInsensitive::EquivClassSize];
- if (caseInsensitiveFlagPresent && CaseInsensitive::RangeToEquivClass(tableIndex, this->currentSurrogatePairNode->value, this->currentSurrogatePairNode->value, actualHigh, equivClass))
- {
- Node *equivNode[CaseInsensitive::EquivClassSize];
- int indexForNextNode = 0;
- for (int i = 0; i < CaseInsensitive::EquivClassSize; i++)
- {
- bool alreadyAdded = false;
- for (int j = 0; j < i; j++)
- {
- if (equivClass[i] == equivClass[j])
- {
- alreadyAdded = true;
- break;
- }
- }
- if (!alreadyAdded)
- {
- if (Js::NumberUtilities::IsInSupplementaryPlane(equivClass[i]))
- {
- Js::NumberUtilities::CodePointAsSurrogatePair(equivClass[i], &lower, &upper);
- equivNode[indexForNextNode] = CreateSurrogatePairAtom(lower, upper);
- }
- else
- {
- equivNode[indexForNextNode] = Anew(ctAllocator, MatchCharNode, (Char)equivClass[i]);
- }
- indexForNextNode ++;
- }
- }
- Assert(indexForNextNode > 0);
- if (indexForNextNode == 1)
- {
- node = equivNode[0];
- }
- else
- {
- AltNode *altNode = Anew(ctAllocator, AltNode, equivNode[0], nullptr);
- AltNode *altNodeTail = altNode;
- for (int i = 1; i < indexForNextNode; i++)
- {
- altNodeTail->tail = Anew(ctAllocator, AltNode, equivNode[i], nullptr);
- altNodeTail = altNodeTail->tail;
- }
- node = altNode;
- }
- }
- else
- {
- Js::NumberUtilities::CodePointAsSurrogatePair(this->currentSurrogatePairNode->value, &lower, &upper);
- node = CreateSurrogatePairAtom(lower, upper);
- }
- previousSurrogatePart = false;
- Assert(ECCanConsume(this->currentSurrogatePairNode->length));
- ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
- RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
- this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
- return true;
- }
- return false;
- }
- //
- // Classes
- //
- template <typename P, const bool IsLiteral>
- bool Parser<P, IsLiteral>::AtSecondSingletonClassAtom()
- {
- Assert(ECLookahead() == '-');
- if (ECLookahead(1) == '\\')
- {
- switch (ECLookahead(2))
- {
- case 'd':
- case 'D':
- case 's':
- case 'S':
- case 'w':
- case 'W':
- // These all denote non-singleton sets
- return false;
- default:
- // fall-through for singleton
- break;
- }
- }
- return true;
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::CharacterClassPass0()
- {
- // Could be terminating 0
- if (ECLookahead() == '^')
- ECConsume();
- EncodedChar nextChar = ECLookahead();
- const EncodedChar* current;
- codepoint_t lastCodepoint = INVALID_CODEPOINT;
- codepoint_t pendingRangeStart = INVALID_CODEPOINT;
- codepoint_t pendingRangeEnd = INVALID_CODEPOINT;
- bool previousSurrogatePart = false;
- while(nextChar != ']')
- {
- current = next;
- if (nextChar == '\0' && IsEOF())
- {
- // Report as unclosed '['
- Fail(JSERR_RegExpNoBracket);
- return;
- } // Otherwise embedded '\0' is ok
- else if (nextChar == '\\')
- {
- // Consume, as classescapepass0 expects for it to be consumed
- Char outChar = NextChar();
- // If previousSurrogatePart = true upon leaving this method, then we are going to pass through here twice
- // This is because \u{} escape sequence was encountered that is actually 2 characters, the second time we will pass consuming entire character
- if (ClassEscapePass0(outChar, previousSurrogatePart))
- {
- lastCodepoint = outChar;
- }
- else
- {
- // Last codepoint isn't a singleton, so no codepoint tracking for the sake of ranges is needed.
- lastCodepoint = INVALID_CODEPOINT;
- // Unless we have a possible range end, cancel our range tracking.
- if (pendingRangeEnd == INVALID_CODEPOINT)
- {
- pendingRangeStart = INVALID_CODEPOINT;
- }
- }
- }
- else if (nextChar == '-')
- {
- if (pendingRangeStart != INVALID_CODEPOINT || lastCodepoint == INVALID_CODEPOINT)
- {
- // '-' is the upper part of the range, with pendingRangeStart codepoint is the lower. Set lastCodePoint to '-' to check at the end of the while statement
- // OR
- // '-' is just a char, consume it and set as last char
- lastCodepoint = NextChar();
- }
- else
- {
- pendingRangeStart = this->next == this->positionAfterLastSurrogate
- ? this->valueOfLastSurrogate
- : lastCodepoint;
- lastCodepoint = INVALID_CODEPOINT;
- NextChar();
- }
- // If we have a pattern of the form [\ud800-\udfff], we need this to be interpreted as a range.
- // In order to achieve this, the two variables that we use to track surrogate pairs, namely
- // tempLocationOfSurrogatePair and previousSurrogatePart, need to be in a certain state.
- //
- // We need to reset tempLocationOfSurrogatePair as it points to the first Unicode escape (\ud800)
- // when we're here. We need to clear it in order not to have a surrogate pair when we process the
- // second escape (\udfff).
- //
- // previousSurrogatePart is used when we have a code point in the \u{...} extended format and the
- // character is in a supplementary plane. However, there is no need to change its value here. When
- // such an escape sequence is encountered, the first call to ClassEscapePass0() sets the variable
- // to true, but it rewinds the input back to the beginning of the escape sequence. The next
- // iteration of the loop here will again call ClassEscape0() with the same character and the
- // variable will this time be set to false. Therefore, the variable will always be false here.
- tempLocationOfSurrogatePair = nullptr;
- Assert(!previousSurrogatePart);
- }
- else
- {
- lastCodepoint = NextChar();
- if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- TrackIfSurrogatePair(lastCodepoint, current, (uint32)(next - current));
- }
- }
- if (!scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- // This is much easier to handle.
- if (pendingRangeStart != INVALID_CODEPOINT && lastCodepoint != INVALID_CODEPOINT)
- {
- Assert(pendingRangeStart < 0x10000);
- Assert(pendingRangeEnd == INVALID_CODEPOINT);
- if (pendingRangeStart > lastCodepoint)
- {
- Fail(JSERR_RegExpBadRange);
- }
- pendingRangeStart = lastCodepoint = INVALID_CODEPOINT;
- }
- }
- // We have a candidate for range end, and we have a range start.
- else if (pendingRangeStart != INVALID_CODEPOINT && (lastCodepoint != INVALID_CODEPOINT || pendingRangeEnd != INVALID_CODEPOINT))
- {
- // The following will be true at the end of each surrogate pair parse.
- // Note, the escape sequence \u{} is a two time parse, so this will be true on the second time around.
- if (this->next == this->positionAfterLastSurrogate)
- {
- lastCodepoint = this->valueOfLastSurrogate;
- Assert(!previousSurrogatePart);
- pendingRangeEnd = lastCodepoint;
- lastCodepoint = INVALID_CODEPOINT;
- }
- // If we the next character is the end of range ']', then we can't have a surrogate pair.
- // The current character is the range end, if we don't already have a candidate.
- else if (ECLookahead() == ']' && pendingRangeEnd == INVALID_CODEPOINT)
- {
- pendingRangeEnd = lastCodepoint;
- }
- //If we get here, and pendingRangeEnd is set. Then one of the above has caused it to be set, or the previous iteration of the loop.
- if (pendingRangeEnd != INVALID_CODEPOINT)
- {
- wchar_t leftSingleChar, rightSingleChar, ignore;
- if (pendingRangeStart >= 0x10000)
- {
- Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeStart, &ignore, &leftSingleChar);
- }
- else
- {
- leftSingleChar = (wchar_t)pendingRangeStart;
- }
- if (pendingRangeEnd >= 0x10000)
- {
- Js::NumberUtilities::CodePointAsSurrogatePair(pendingRangeEnd, &rightSingleChar, &ignore);
- }
- else
- {
- rightSingleChar = (wchar_t)pendingRangeEnd;
- }
- // Here it is a bit tricky, we don't know if we have a unicode option specified.
- // If it is, then \ud800\udc00 - \ud800\udc01 is valid, otherwise invalid.
- if (pendingRangeStart < 0x10000 && pendingRangeEnd < 0x10000 && pendingRangeStart > pendingRangeEnd)
- {
- Fail(JSERR_RegExpBadRange);
- }
- else
- {
- if(leftSingleChar > rightSingleChar)
- {
- DeferredFailIfNotUnicode(JSERR_RegExpBadRange);
- }
- if (pendingRangeStart > pendingRangeEnd)
- {
- DeferredFailIfUnicode(JSERR_RegExpBadRange);
- }
- }
- pendingRangeStart = pendingRangeEnd = INVALID_CODEPOINT;
- }
- // The current char < 0x10000 is a candidate for the range end, but we need to iterate one more time.
- else
- {
- pendingRangeEnd = lastCodepoint;
- }
- }
- nextChar = ECLookahead();
- }
- // We should never have a pendingRangeEnd set when we exit the loop
- Assert(pendingRangeEnd == INVALID_CODEPOINT);
- }
- template <typename P, const bool IsLiteral>
- template <bool containsSurrogates>
- Node* Parser<P, IsLiteral>::CharacterClassPass1()
- {
- Assert(containsSurrogates ? unicodeFlagPresent : true);
- CharSet<codepoint_t> codePointSet;
- MatchSetNode deferredSetNode(false, false);
- MatchCharNode deferredCharNode(0);
- bool isNegation = false;
- if (ECLookahead() == '^')
- {
- isNegation = true;
- ECConsume();
- }
- // We aren't expecting any terminating null characters, only embedded ones that should treated as valid characters.
- // CharacterClassPass0 should have taken care of terminating null.
- codepoint_t pendingCodePoint = INVALID_CODEPOINT;
- codepoint_t pendingRangeStart = INVALID_CODEPOINT;
- EncodedChar nextChar = ECLookahead();
- bool previousWasASurrogate = false;
- while(nextChar != ']')
- {
- codepoint_t codePointToSet = INVALID_CODEPOINT;
- // Consume ahead of time if we have two backslashes, both cases below (previously Tracked surrogate pair, and ClassEscapePass1) assume it is.
- if (nextChar == '\\')
- {
- ECConsume();
- }
- // These if-blocks are the logical ClassAtomPass1, they weren't grouped into a method to simplify dealing with multiple out parameters.
- if (containsSurrogates && this->currentSurrogatePairNode != nullptr && this->currentSurrogatePairNode->location == this->next)
- {
- codePointToSet = pendingCodePoint;
- pendingCodePoint = this->currentSurrogatePairNode->value;
- Assert(ECCanConsume(this->currentSurrogatePairNode->length));
- ECConsumeMultiUnit(this->currentSurrogatePairNode->length);
- RestoreMultiUnits(this->currentSurrogatePairNode->multiUnits);
- this->currentSurrogatePairNode = this->currentSurrogatePairNode->next;
- }
- else if (nextChar == '\\')
- {
- Node* returnedNode = ClassEscapePass1(&deferredCharNode, &deferredSetNode, previousWasASurrogate);
- if (returnedNode->tag == Node::MatchSet)
- {
- codePointToSet = pendingCodePoint;
- pendingCodePoint = INVALID_CODEPOINT;
- if (pendingRangeStart != INVALID_CODEPOINT)
- {
- codePointSet.Set(ctAllocator, '-');
- }
- pendingRangeStart = INVALID_CODEPOINT;
- codePointSet.UnionInPlace(ctAllocator, deferredSetNode.set);
- }
- else
- {
- // Just a character
- codePointToSet = pendingCodePoint;
- pendingCodePoint = deferredCharNode.cs[0];
- }
- }
- else if (nextChar == '-')
- {
- if (pendingRangeStart != INVALID_CODEPOINT || pendingCodePoint == INVALID_CODEPOINT || ECLookahead(1) == ']')
- {
- // - is just a char, or end of a range.
- codePointToSet = pendingCodePoint;
- pendingCodePoint = '-';
- ECConsume();
- }
- else
- {
- pendingRangeStart = pendingCodePoint;
- ECConsume();
- }
- }
- else
- {
- // Just a character, consume it
- codePointToSet = pendingCodePoint;
- pendingCodePoint = NextChar();
- }
- if (codePointToSet != INVALID_CODEPOINT)
- {
- if (pendingRangeStart != INVALID_CODEPOINT)
- {
- if (pendingRangeStart > pendingCodePoint)
- {
- //We have no unicodeFlag, but current range contains surrogates, thus we may end up having to throw a "Syntax" error here
- //This breaks the notion of Pass0 check for valid syntax, because we don't know if we have a unicode option
- Assert(!unicodeFlagPresent);
- Fail(JSERR_RegExpBadRange);
- }
- codePointSet.SetRange(ctAllocator, pendingRangeStart, pendingCodePoint);
- pendingRangeStart = pendingCodePoint = INVALID_CODEPOINT;
- }
- else
- {
- codePointSet.Set(ctAllocator, codePointToSet);
- }
- }
- nextChar = ECLookahead();
- }
- if (pendingCodePoint != INVALID_CODEPOINT)
- {
- codePointSet.Set(ctAllocator, pendingCodePoint);
- }
- // At this point, we have a complete set of codepoints representing the range.
- // Before performing translation of any kind, we need to do some case filling.
- // At the point of this comment, there are no case mappings going cross-plane between simple
- // characters (< 0x10000) and supplementary characters (>= 0x10000)
- // However, it might still be the case, and this has to be handled.
- // On the other hand, we don't want to prevent optimizations that expect non-casefolded sets from happening.
- // At least for simple characters.
- // The simple case, is when the unicode flag isn't specified, we can go ahead and return the simple set.
- // Negations and case mappings will be handled later.
- if (!unicodeFlagPresent)
- {
- Assert(codePointSet.SimpleCharCount() == codePointSet.Count());
- MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
- codePointSet.CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
- return simpleToReturn;
- }
- // Everything past here must be under the flag
- Assert(scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
- if (codePointSet.IsEmpty())
- {
- return Anew(ctAllocator, MatchSetNode, false, false);
- }
- Node* prefixNode = nullptr;
- Node* suffixNode = nullptr;
- CharSet<codepoint_t> *toUseForTranslation = &codePointSet;
- // If a singleton, return a simple character
- bool isSingleton = !this->caseInsensitiveFlagPresent && !isNegation && codePointSet.IsSingleton();
- if (isSingleton)
- {
- codepoint_t singleton = codePointSet.Singleton();
- Node* toReturn = nullptr;
- if (singleton < 0x10000)
- {
- toReturn = Anew(ctAllocator, MatchCharNode, (wchar_t)singleton);
- }
- else
- {
- Assert(unicodeFlagPresent);
- wchar_t lowerSurrogate, upperSurrogate;
- Js::NumberUtilities::CodePointAsSurrogatePair(singleton, &lowerSurrogate, &upperSurrogate);
- toReturn = CreateSurrogatePairAtom(lowerSurrogate, upperSurrogate);
- }
- codePointSet.Clear(ctAllocator);
- return toReturn;
- }
- if (!this->caseInsensitiveFlagPresent)
- {
- // If negation, we want to complement the simple chars.
- // When a set is negated, optimizations skip checking if applicable, so we can go ahead and negate it here.
- CharSet<codepoint_t> negatedSet;
- if (isNegation)
- {
- // Complement all characters, and use it as the set toTranslate
- codePointSet.ToComplement(ctAllocator, negatedSet);
- }
- toUseForTranslation = isNegation ? &negatedSet : &codePointSet;
- if (isNegation)
- {
- // Clear this, as we will no longer need this.
- codePointSet.FreeBody(ctAllocator);
- }
- }
- else
- {
- CharSet<codepoint_t> caseEquivalent;
- codePointSet.ToEquivClass(ctAllocator, caseEquivalent);
- // Equiv set can't have a reduced count of chars
- Assert(caseEquivalent.Count() >= codePointSet.Count());
- // Here we have a regex that has both case insensitive and unicode options.
- // The range might also be negated. If it is negated, we can go ahead and negate
- // the entire set as well as fill in cases, as optimizations wouldn't kick in anyways.
- if (isNegation)
- {
- codePointSet.Clear(ctAllocator);
- caseEquivalent.ToComplement(ctAllocator, codePointSet);
- caseEquivalent.FreeBody(ctAllocator);
- }
- else
- {
- codePointSet.CloneFrom(ctAllocator, caseEquivalent);
- }
- Assert(toUseForTranslation == &codePointSet);
- }
- uint totalCodePointsCount = toUseForTranslation->Count();
- uint simpleCharsCount = toUseForTranslation->SimpleCharCount();
- if (totalCodePointsCount == simpleCharsCount)
- {
- MatchSetNode *simpleToReturn = Anew(ctAllocator, MatchSetNode, isNegation);
- toUseForTranslation->CloneSimpleCharsTo(ctAllocator, simpleToReturn->set);
- return simpleToReturn;
- }
- if (simpleCharsCount > 0)
- {
- if (!toUseForTranslation->ContainSurrogateCodeUnits())
- {
- MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
- toUseForTranslation->CloneSimpleCharsTo(ctAllocator, node->set);
- prefixNode = node;
- }
- else
- {
- MatchSetNode *node = Anew(ctAllocator, MatchSetNode, false, false);
- toUseForTranslation->CloneNonSurrogateCodeUnitsTo(ctAllocator, node->set);
- prefixNode = node;
- node = Anew(ctAllocator, MatchSetNode, false, false);
- toUseForTranslation->CloneSurrogateCodeUnitsTo(ctAllocator, node->set);
- suffixNode = node;
- }
- }
- Assert(unicodeFlagPresent);
- AltNode *headToReturn = prefixNode == nullptr ? nullptr : Anew(ctAllocator, AltNode, prefixNode, nullptr);
- AltNode *currentTail = headToReturn;
- codepoint_t charRangeSearchIndex = 0x10000, lowerCharOfRange = 0, upperCharOfRange = 0;
- while (toUseForTranslation->GetNextRange(charRangeSearchIndex, &lowerCharOfRange, &upperCharOfRange))
- {
- if (lowerCharOfRange == upperCharOfRange)
- {
- currentTail = this->AppendSurrogatePairToDisjunction(lowerCharOfRange, currentTail);
- }
- else
- {
- currentTail = this->AppendSurrogateRangeToDisjunction(lowerCharOfRange, upperCharOfRange, currentTail);
- }
- if (headToReturn == nullptr)
- {
- headToReturn = currentTail;
- }
- AnalysisAssert(currentTail != nullptr);
- while (currentTail->tail != nullptr)
- {
- currentTail = currentTail->tail;
- }
- charRangeSearchIndex = upperCharOfRange + 1;
- }
- if (suffixNode != nullptr)
- {
- currentTail->tail = Anew(ctAllocator, AltNode, suffixNode, nullptr);
- }
- toUseForTranslation->Clear(ctAllocator);
- if (headToReturn != nullptr && headToReturn->tail == nullptr)
- {
- return headToReturn->head;
- }
- return headToReturn;
- }
- #pragma warning(push)
- #pragma warning(disable:4702) // unreachable code
- template <typename P, const bool IsLiteral>
- bool Parser<P, IsLiteral>::ClassEscapePass0(Char& singleton, bool& previousSurrogatePart)
- {
- // Could be terminating 0
- EncodedChar ec = ECLookahead();
- if (ec == 0 && IsEOF())
- {
- Fail(JSERR_RegExpSyntax);
- return false;
- }
- else if (standardEncodedChars->IsOctal(ec))
- {
- uint n = 0;
- CharCount digits = 0;
- do
- {
- uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
- if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
- break;
- n = m;
- ECConsume();
- digits++;
- }
- while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
- singleton = UTC((UChar)n);
- // Clear possible pair
- this->tempLocationOfSurrogatePair = nullptr;
- return true;
- }
- else
- {
- const EncodedChar* location = this->tempLocationOfSurrogatePair;
- // Clear it for now, otherwise to many branches to clear it on.
- this->tempLocationOfSurrogatePair = nullptr;
- // An escaped '/' is ok
- Char c = NextChar();
- switch (c)
- {
- case 'b':
- singleton = '\b';
- return true;
- case 'f':
- singleton = '\f';
- return true;
- case 'n':
- singleton = '\n';
- return true;
- case 'r':
- singleton = '\r';
- return true;
- case 't':
- singleton = '\t';
- return true;
- case 'v':
- singleton = '\v';
- return true;
- case 'd':
- case 'D':
- case 's':
- case 'S':
- case 'w':
- case 'W':
- return false;
- case 'c':
- if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
- {
- singleton = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
- ECConsume();
- }
- else
- {
- if (!IsEOF())
- {
- EncodedChar ec = ECLookahead();
- switch (ec)
- {
- case '-':
- case ']':
- singleton = c;
- break;
- default:
- singleton = UTC(Chars<EncodedChar>::CTU(ec) % 32);
- ECConsume();
- break;
- }
- }
- else
- singleton = c;
- }
- return true;
- case 'x':
- if (ECCanConsume(2) &&
- standardEncodedChars->IsHex(ECLookahead(0)) &&
- standardEncodedChars->IsHex(ECLookahead(1)))
- {
- singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
- (standardEncodedChars->DigitValue(ECLookahead(1))));
- ECConsume(2);
- }
- else
- singleton = c;
- return true;
- case 'u':
- this->tempLocationOfSurrogatePair = location;
- if (this->TryParseExtendedUnicodeEscape(singleton, previousSurrogatePart, true) > 0)
- return true;
- else if (ECCanConsume(4) &&
- standardEncodedChars->IsHex(ECLookahead(0)) &&
- standardEncodedChars->IsHex(ECLookahead(1)) &&
- standardEncodedChars->IsHex(ECLookahead(2)) &&
- standardEncodedChars->IsHex(ECLookahead(3)))
- {
- singleton = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
- (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
- (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
- (standardEncodedChars->DigitValue(ECLookahead(3))));
- if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- // Current location
- TrackIfSurrogatePair(singleton, (next - 1), 5);
- }
- // The above if statement, if true, will clear tempLocationOfSurrogatePair if needs to.
- ECConsume(4);
- }
- else
- singleton = c;
- return true;
- default:
- // embedded 0 is ok
- singleton = c;
- return true;
- }
- }
- }
- #pragma warning(pop)
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::ClassEscapePass1(MatchCharNode* deferredCharNode, MatchSetNode* deferredSetNode, bool& previousSurrogatePart)
- {
- // Checked for terminating 0 is pass 0
- Assert(!IsEOF());
- if (standardEncodedChars->IsOctal(ECLookahead()))
- {
- // As per Annex B, allow octal escapes instead of just \0 (and \8 and \9 are identity escapes).
- // Must be between 1 and 3 octal digits.
- uint n = 0;
- CharCount digits = 0;
- do
- {
- uint m = n * 8 + standardEncodedChars->DigitValue(ECLookahead());
- if (m > Chars<uint8>::MaxUChar) //Regex octal codes only support single byte (ASCII) characters.
- break;
- n = m;
- ECConsume();
- digits++;
- }
- while (digits < 3 && standardEncodedChars->IsOctal(ECLookahead())); // terminating 0 is not octal
- deferredCharNode->cs[0] = UTC((UChar)n);
- return deferredCharNode;
- }
- else
- {
- Char c = NextChar();
- switch (c)
- {
- case 'b':
- c = '\b';
- break; // fall-through for identity escape
- case 'f':
- c = '\f';
- break; // fall-through for identity escape
- case 'n':
- c = '\n';
- break; // fall-through for identity escape
- case 'r':
- c = '\r';
- break; // fall-through for identity escape
- case 't':
- c = '\t';
- break; // fall-through for identity escape
- case 'v':
- c = '\v';
- break; // fall-through for identity escape
- case 'd':
- standardChars->SetDigits(ctAllocator, deferredSetNode->set);
- return deferredSetNode;
- case 'D':
- standardChars->SetNonDigits(ctAllocator, deferredSetNode->set);
- return deferredSetNode;
- case 's':
- standardChars->SetWhitespace(ctAllocator, deferredSetNode->set);
- return deferredSetNode;
- case 'S':
- standardChars->SetNonWhitespace(ctAllocator, deferredSetNode->set);
- return deferredSetNode;
- case 'w':
- standardChars->SetWordChars(ctAllocator, deferredSetNode->set);
- return deferredSetNode;
- case 'W':
- standardChars->SetNonWordChars(ctAllocator, deferredSetNode->set);
- return deferredSetNode;
- case 'c':
- if (standardEncodedChars->IsLetter(ECLookahead())) // terminating 0 is not a letter
- {
- c = UTC(Chars<EncodedChar>::CTU(ECLookahead()) % 32);
- ECConsume();
- // fall-through for identity escape
- }
- else
- {
- // SPEC DEVIATION: For non-letters, still take lower 5 bits, e.g. [\c1] == [\x11].
- // However, '-', ']', and EOF make the \c just a 'c'.
- if (!IsEOF())
- {
- EncodedChar ec = ECLookahead();
- switch (ec)
- {
- case '-':
- case ']':
- // fall-through for identity escape with 'c'
- break;
- default:
- c = UTC(Chars<EncodedChar>::CTU(ec) % 32);
- ECConsume();
- // fall-through for identity escape
- break;
- }
- }
- // else: fall-through for identity escape with 'c'
- }
- break;
- case 'x':
- if (ECCanConsume(2) &&
- standardEncodedChars->IsHex(ECLookahead(0)) &&
- standardEncodedChars->IsHex(ECLookahead(1)))
- {
- c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 4) |
- (standardEncodedChars->DigitValue(ECLookahead(1))));
- ECConsume(2);
- // fall-through for identity escape
- }
- // Take to be identity escape if ill-formed as per Annex B
- break;
- case 'u':
- if (unicodeFlagPresent && TryParseExtendedUnicodeEscape(c, previousSurrogatePart) > 0)
- break;
- else if (ECCanConsume(4) &&
- standardEncodedChars->IsHex(ECLookahead(0)) &&
- standardEncodedChars->IsHex(ECLookahead(1)) &&
- standardEncodedChars->IsHex(ECLookahead(2)) &&
- standardEncodedChars->IsHex(ECLookahead(3)))
- {
- c = UTC((standardEncodedChars->DigitValue(ECLookahead(0)) << 12) |
- (standardEncodedChars->DigitValue(ECLookahead(1)) << 8) |
- (standardEncodedChars->DigitValue(ECLookahead(2)) << 4) |
- (standardEncodedChars->DigitValue(ECLookahead(3))));
- ECConsume(4);
- // fall-through for identity escape
- }
- // Take to be identity escape if ill-formed as per Annex B.
- break;
- default:
- // As per Annex B, allow anything other than newlines and above. Embedded 0 is ok.
- break;
- }
- // Must be an identity escape
- deferredCharNode->cs[0] = c;
- return deferredCharNode;
- }
- }
- //
- // Options
- //
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::Options(RegexFlags& flags)
- {
- while (true)
- {
- // Could be terminating 0
- EncodedChar ec = ECLookahead();
- CharCount consume;
- Char c;
- if (ec == 0)
- // Embedded 0 not valid
- return;
- else if (IsLiteral &&
- ec == '\\' &&
- ECCanConsume(6) &&
- ECLookahead(1) == 'u' &&
- standardEncodedChars->IsHex(ECLookahead(2)) &&
- standardEncodedChars->IsHex(ECLookahead(3)) &&
- standardEncodedChars->IsHex(ECLookahead(4)) &&
- standardEncodedChars->IsHex(ECLookahead(5)))
- {
- if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- Fail(JSERR_RegExpSyntax);
- return;
- }
- else
- {
- uint32 n = (standardEncodedChars->DigitValue(ECLookahead(2)) << 12) |
- (standardEncodedChars->DigitValue(ECLookahead(3)) << 8) |
- (standardEncodedChars->DigitValue(ECLookahead(4)) << 4) |
- (standardEncodedChars->DigitValue(ECLookahead(5)));
- c = UTC(n);
- consume = 6;
- }
- }
- else
- {
- c = Chars<EncodedChar>::CTW(ec);
- consume = 1;
- }
- switch (c) {
- case 'i':
- if ((flags & IgnoreCaseRegexFlag) != 0)
- {
- Fail(JSERR_RegExpSyntax);
- }
- flags = (RegexFlags)(flags | IgnoreCaseRegexFlag);
- break;
- case 'g':
- if ((flags & GlobalRegexFlag) != 0)
- {
- Fail(JSERR_RegExpSyntax);
- }
- flags = (RegexFlags)(flags | GlobalRegexFlag);
- break;
- case 'm':
- if ((flags & MultilineRegexFlag) != 0)
- {
- Fail(JSERR_RegExpSyntax);
- }
- flags = (RegexFlags)(flags | MultilineRegexFlag);
- break;
- case 'u':
- // If we don't have unicode enabled, fall through to default
- if (scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled())
- {
- if ((flags & UnicodeRegexFlag) != 0)
- {
- Fail(JSERR_RegExpSyntax);
- }
- flags = (RegexFlags)(flags | UnicodeRegexFlag);
- // For telemetry
- CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(UnicodeRegexFlagCount, scriptContext);
- break;
- }
- case 'y':
- if (scriptContext->GetConfig()->IsES6RegExStickyEnabled())
- {
- if ((flags & StickyRegexFlag) != 0)
- {
- Fail(JSERR_RegExpSyntax);
- }
- flags = (RegexFlags)(flags | StickyRegexFlag);
- // For telemetry
- CHAKRATEL_LANGSTATS_INC_LANGFEATURECOUNT(StickyRegexFlagCount, scriptContext);
- break;
- }
- default:
- if (standardChars->IsWord(c))
- {
- // Outer context could never parse this character. Signal the syntax error as
- // being part of the regex.
- Fail(JSERR_RegExpSyntax);
- }
- return;
- }
- ECConsume(consume);
- }
- }
- //
- // Entry points
- //
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::ParseDynamic
- ( const EncodedChar* body
- , const EncodedChar* bodyLim
- , const EncodedChar* opts
- , const EncodedChar* optsLim
- , RegexFlags& flags )
- {
- Assert(!IsLiteral);
- Assert(body != 0);
- Assert(bodyLim >= body && *bodyLim == 0);
- Assert(opts == 0 || (optsLim >= opts && *optsLim == 0));
- // Body, pass 0
- SetPosition(body, bodyLim, true);
- PatternPass0();
- if (!IsEOF())
- Fail(JSERR_RegExpSyntax);
- // Options
- if (opts != 0)
- {
- SetPosition(opts, optsLim, false);
- Options(flags);
- if (!IsEOF())
- Fail(JSERR_RegExpSyntax);
- this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
- this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
- Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
- }
- else
- {
- this->unicodeFlagPresent = false;
- this->caseInsensitiveFlagPresent = false;
- }
- // If this HR has been set, that means we have an earlier failure than the one caught above.
- if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
- {
- throw ParseError(*deferredIfNotUnicodeError);
- }
- else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
- {
- throw ParseError(*deferredIfUnicodeError);
- }
- this->currentSurrogatePairNode = this->surrogatePairList;
- // Body, pass 1
- SetPosition(body, bodyLim, true);
- Node* root = PatternPass1();
- Assert(IsEOF());
- return root;
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::ParseLiteral
- ( const EncodedChar* input
- , const EncodedChar* inputLim
- , CharCount& outBodyEncodedChars
- , CharCount& outTotalEncodedChars
- , CharCount& outBodyChars
- , CharCount& outTotalChars
- , RegexFlags& flags )
- {
- Assert(IsLiteral);
- Assert(input != 0);
- Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
- // To handle surrogate pairs properly under unicode option, we will collect information on location of the pairs
- // during pass 0, regardless if the option is present. (We aren't able to get it at that time)
- // During pass 1, we will use that information to correctly create appropriate nodes.
- // Body, pass 0
- SetPosition(input, inputLim, true);
- PatternPass0();
- outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
- outBodyChars = Pos();
- // Options are needed for the next pass
- ECMust('/', ERRnoSlash);
- Options(flags);
- this->unicodeFlagPresent = (flags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
- this->caseInsensitiveFlagPresent = (flags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
- Assert(!this->unicodeFlagPresent || scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled());
- // If this HR has been set, that means we have an earlier failure than the one caught above.
- if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
- {
- throw ParseError(*deferredIfNotUnicodeError);
- }
- else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
- {
- throw ParseError(*deferredIfUnicodeError);
- }
- // Used below to proceed to the end of the regex
- const EncodedChar *pastOptions = next;
- this->currentSurrogatePairNode = this->surrogatePairList;
- // Body, pass 1
- SetPosition(input, inputLim, true);
- Node* root = PatternPass1();
- Assert(outBodyEncodedChars == Chars<EncodedChar>::OSB(next, input));
- Assert(outBodyChars == Pos());
- next = pastOptions;
- outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
- outTotalChars = Pos();
- return root;
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::ParseLiteralNoAST
- ( const EncodedChar* input
- , const EncodedChar* inputLim
- , CharCount& outBodyEncodedChars
- , CharCount& outTotalEncodedChars
- , CharCount& outBodyChars
- , CharCount& outTotalChars )
- {
- Assert(IsLiteral);
- Assert(input != 0);
- Assert(inputLim >= input); // *inputLim need not be 0 because of deferred parsing
- // Body, pass 0
- SetPosition(input, inputLim, true);
- PatternPass0();
- outBodyEncodedChars = Chars<EncodedChar>::OSB(next, input);
- outBodyChars = Pos();
- // Options
- ECMust('/', ERRnoSlash);
- RegexFlags dummyFlags = NoRegexFlags;
- Options(dummyFlags);
- this->unicodeFlagPresent = (dummyFlags & UnifiedRegex::UnicodeRegexFlag) == UnifiedRegex::UnicodeRegexFlag;
- this->caseInsensitiveFlagPresent = (dummyFlags & UnifiedRegex::IgnoreCaseRegexFlag) == UnifiedRegex::IgnoreCaseRegexFlag;
- outTotalEncodedChars = Chars<EncodedChar>::OSB(next, input);
- outTotalChars = Pos();
- // If this HR has been set, that means we have an earlier failure than the one caught above.
- if (this->deferredIfNotUnicodeError != nullptr && !this->unicodeFlagPresent)
- {
- throw ParseError(*deferredIfNotUnicodeError);
- }
- else if(this->deferredIfUnicodeError != nullptr && this->unicodeFlagPresent)
- {
- throw ParseError(*deferredIfUnicodeError);
- }
- }
- template <typename P, const bool IsLiteral>
- template <const bool buildAST>
- RegexPattern * Parser<P, IsLiteral>::CompileProgram
- ( Node* root,
- const EncodedChar*& currentCharacter,
- const CharCount totalLen,
- const CharCount bodyChars,
- const CharCount totalChars,
- const RegexFlags flags )
- {
- Assert(IsLiteral);
- Program* program = nullptr;
- if (buildAST)
- {
- const auto recycler = this->scriptContext->GetRecycler();
- program = Program::New(recycler, flags);
- this->CaptureSourceAndGroups(recycler, program, currentCharacter, bodyChars);
- }
- currentCharacter += totalLen;
- Assert(GetMultiUnits() == totalLen - totalChars);
- if (!buildAST)
- {
- return nullptr;
- }
- RegexPattern* pattern = RegexPattern::New(this->scriptContext, program, true);
- #if ENABLE_REGEX_CONFIG_OPTIONS
- RegexStats* stats = 0;
- if (REGEX_CONFIG_FLAG(RegexProfile))
- {
- stats = this->scriptContext->GetRegexStatsDatabase()->GetRegexStats(pattern);
- this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Parse);
- }
- if (REGEX_CONFIG_FLAG(RegexTracing))
- {
- DebugWriter* tw = this->scriptContext->GetRegexDebugWriter();
- tw->Print(L"// REGEX COMPILE ");
- pattern->Print(tw);
- tw->EOL();
- }
- if (REGEX_CONFIG_FLAG(RegexProfile))
- this->scriptContext->GetRegexStatsDatabase()->BeginProfile();
- #endif
- ArenaAllocator* rtAllocator = this->scriptContext->RegexAllocator();
- Compiler::Compile
- ( this->scriptContext
- , ctAllocator
- , rtAllocator
- , standardChars
- , program
- , root
- , this->GetLitbuf()
- , pattern
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , w
- , stats
- #endif
- );
- #if ENABLE_REGEX_CONFIG_OPTIONS
- if (REGEX_CONFIG_FLAG(RegexProfile))
- this->scriptContext->GetRegexStatsDatabase()->EndProfile(stats, RegexStats::Compile);
- #endif
- #ifdef PROFILE_EXEC
- this->scriptContext->ProfileEnd(Js::RegexCompilePhase);
- #endif
- return pattern;
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::CaptureEmptySourceAndNoGroups(Program* program)
- {
- Assert(program->source == 0);
- program->source = L"";
- program->sourceLen = 0;
- program->numGroups = 1;
- // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::CaptureSourceAndGroups(Recycler* recycler, Program* program, const EncodedChar* body, CharCount bodyChars)
- {
- Assert(program->source == 0);
- Assert(body != 0);
- // Program will own source string
- program->source = RecyclerNewArrayLeaf(recycler, Char, bodyChars + 1);
- // Don't need to zero out since we're writing to the buffer right here
- ConvertToUnicode(program->source, bodyChars, body);
- program->source[bodyChars] = 0;
- program->sourceLen = bodyChars;
- program->numGroups = nextGroupId;
- // Remaining to set during compilation: litbuf, litbufLen, numLoops, insts, instsLen, entryPointLabel
- }
- template <typename P, const bool IsLiteral>
- Node* Parser<P, IsLiteral>::GetNodeWithValidCharacterSet(EncodedChar cc)
- {
- Node* nodeToReturn = nullptr;
- if (this->scriptContext->GetConfig()->IsES6UnicodeExtensionsEnabled() && this->unicodeFlagPresent)
- {
- MatchSetNode *lowerRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
- lowerRangeNode->set.SetRange(ctAllocator, (Char)0xD800, (Char)0xDBFF);
- MatchSetNode *upperRangeNode = Anew(ctAllocator, MatchSetNode, false, false);
- upperRangeNode->set.SetRange(ctAllocator, (Char)0xDC00, (Char)0xDFFF);
- ConcatNode* surrogateRangePairNode = Anew(ctAllocator, ConcatNode, lowerRangeNode, Anew(ctAllocator, ConcatNode, upperRangeNode, nullptr));
- // The MatchSet node will be split into [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) as a prefix, and a suffix of [D800-DBFF]
- // i.e. The MatchSet node can be with [0-D7FFDC00-FFFF] (minus special characters like newline, whitespace, etc.) OR [D800-DBFF]
- MatchSetNode* partialPrefixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
- switch (cc)
- {
- case '.':
- standardChars->SetNonNewline(ctAllocator, partialPrefixSetNode->set);
- break;
- case 'S':
- standardChars->SetNonWhitespace(ctAllocator, partialPrefixSetNode->set);
- break;
- case 'D':
- standardChars->SetNonDigits(ctAllocator, partialPrefixSetNode->set);
- break;
- case 'W':
- standardChars->SetNonWordChars(ctAllocator, partialPrefixSetNode->set);
- break;
- default:
- AssertMsg(false, "");
- }
- partialPrefixSetNode->set.SubtractRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
- MatchSetNode* partialSuffixSetNode = Anew(ctAllocator, MatchSetNode, false, false);
- partialSuffixSetNode->set.SetRange(ctAllocator, (Char)0xD800u, (Char)0xDBFFu);
- AltNode* altNode = Anew(ctAllocator, AltNode, partialPrefixSetNode, Anew(ctAllocator, AltNode, surrogateRangePairNode, Anew(ctAllocator, AltNode, partialSuffixSetNode, nullptr)));
- nodeToReturn = altNode;
- }
- else
- {
- MatchSetNode* setNode = Anew(ctAllocator, MatchSetNode, false, false);
- switch (cc)
- {
- case '.':
- standardChars->SetNonNewline(ctAllocator, setNode->set);
- break;
- case 'S':
- standardChars->SetNonWhitespace(ctAllocator, setNode->set);
- break;
- case 'D':
- standardChars->SetNonDigits(ctAllocator, setNode->set);
- break;
- case 'W':
- standardChars->SetNonWordChars(ctAllocator, setNode->set);
- break;
- default:
- AssertMsg(false, "");
- }
- nodeToReturn = setNode;
- }
- return nodeToReturn;
- }
- template <typename P, const bool IsLiteral>
- void Parser<P, IsLiteral>::FreeBody()
- {
- if (litbuf != 0)
- {
- ctAllocator->Free(litbuf, litbufLen);
- litbuf = 0;
- litbufLen = 0;
- litbufNext = 0;
- }
- }
- //
- // Template instantiation
- //
- template <typename P, const bool IsLiteral>
- void UnifiedRegexParserForceInstantiation()
- {
- typedef typename P::EncodedChar EncodedChar;
- Parser<P, IsLiteral> p
- ( 0
- , 0
- , 0
- , 0
- , false
- #if ENABLE_REGEX_CONFIG_OPTIONS
- , 0
- #endif
- );
- RegexFlags f;
- CharCount a, b, c, d;
- const EncodedChar* cp = 0;
- p.ParseDynamic(0, 0, 0, 0, f);
- p.ParseLiteral(0, 0, a, b, c, d, f);
- p.ParseLiteralNoAST(0, 0, a, b, c, d);
- p.CompileProgram<true>(0, cp, a, b, c, f);
- p.CompileProgram<false>(0, cp, a, b, c, f);
- p.CaptureEmptySourceAndNoGroups(0);
- p.CaptureSourceAndGroups(0, 0, 0, 0);
- p.FreeBody();
- }
- void UnifiedRegexParserForceAllInstantiations()
- {
- UnifiedRegexParserForceInstantiation<NullTerminatedUnicodeEncodingPolicy, false>();
- UnifiedRegexParserForceInstantiation<NullTerminatedUnicodeEncodingPolicy, true>();
- UnifiedRegexParserForceInstantiation<NullTerminatedUTF8EncodingPolicy, false>();
- UnifiedRegexParserForceInstantiation<NullTerminatedUTF8EncodingPolicy, true>();
- UnifiedRegexParserForceInstantiation<NotNullTerminatedUTF8EncodingPolicy, false>();
- UnifiedRegexParserForceInstantiation<NotNullTerminatedUTF8EncodingPolicy, true>();
- }
- }
|