pnodediff.h 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. //-----------------------------------------------------------------------------
  7. enum EditKind
  8. {
  9. // No change.
  10. None = 0,
  11. // Node value was updated.
  12. Update,
  13. // Node was inserted.
  14. Insert,
  15. // Node was deleted.
  16. Delete,
  17. // Node changed parent.
  18. Move,
  19. // Node changed position within its parent. The parent nodes of the old node and the new node are matching.
  20. Reorder,
  21. };
  22. //-----------------------------------------------------------------------------
  23. // Calculates Longest Common Subsequence.
  24. // This uses the basic version in
  25. //
  26. // EUGENE W. MYERS: An O(ND) Difference Algorithm and Its Variations
  27. //
  28. // The idea is that LCS is a dual problem of shortest path in an edit graph. The edit graph is a grid of lengthA
  29. // columns and lengthB rows. A path starts from (0,0) and moves toward (lengthA, lengthB).
  30. // - A horizontal move (i,j) -> (i+1,j) represents deleting A[i].
  31. // - A vertical move (i,j) -> (i,j+1) represents inserting B[j].
  32. // - A diagonal move (i,j) -> (i+1,j+1) represents a match, A[i] == B[j].
  33. // Each diagonal move represents a match. We want more diagonal moves. Let diagonal move cost 0, horizontal or
  34. // vertical move each costs 1. The basic algorithm is a greedy algorithm to find a shortest path from (0,0) to
  35. // (lengthA, lengthB).
  36. //
  37. // Terms:
  38. // diagonal k: The diagonal where x-y==k.
  39. // d-path: A path starting from (0,0) with d number of horizontal or vertical moves. Or, its length is d (note
  40. // that each horizontal/vertical move costs 1, diagonal move costs 0).
  41. //
  42. // 0-path can only move along and end on diagonal 0.
  43. // 1-path can only end on diagonal -1 or 1.
  44. // d-path can end on diagonal [-d, -d+2, ..., d-2, d].
  45. //
  46. // The basic algorithm tries to find the smallest d, where there is a d-path reaches (lengthA, lengthB).
  47. //-----------------------------------------------------------------------------
  48. template <class Allocator>
  49. class LongestCommonSubsequence
  50. {
  51. private:
  52. // Stores d-path furthest reaching endpoints. They can be on diagonal [-d, -d+2, ..., d-2, d].
  53. class EndPoints
  54. {
  55. private:
  56. int d;
  57. int x[]; // Stores x for endpoints on the (d+1) diagonals. y == x - k.
  58. EndPoints(int d) : d(d)
  59. {
  60. }
  61. public:
  62. int getd() const
  63. {
  64. return d;
  65. }
  66. // Get x of furthest reaching endpoint on diagonal k.
  67. int operator[](int k) const
  68. {
  69. Assert(k >= -d && k <= d && (d - k) % 2 == 0); // k must be in [-d, -d+2, ..., d-2, d]
  70. int i = (k + d) / 2;
  71. return x[i];
  72. }
  73. // Get x reference of furthest reaching endpoint on diagonal k.
  74. int& operator[](int k)
  75. {
  76. Assert(k >= -d && k <= d && (d - k) % 2 == 0); // k must be in [-d, -d+2, ..., d-2, d]
  77. int i = (k + d) / 2;
  78. return x[i];
  79. }
  80. static EndPoints* New(Allocator* alloc, int d)
  81. {
  82. Assert(d >= 0);
  83. return AllocatorNewPlusLeaf(Allocator, alloc, sizeof(int) * (d + 1), EndPoints, d);
  84. }
  85. void Destroy(Allocator* alloc)
  86. {
  87. AllocatorDeletePlusLeaf(Allocator, alloc, sizeof(int) * (d + 1), this);
  88. }
  89. };
  90. // Represents an EditGraph for finding LCS
  91. class EditGraph
  92. {
  93. private:
  94. typedef JsUtil::List<EndPoints*, Allocator> EndPointsList;
  95. EndPointsList m_endPoints; // Stores endPoints for paths: -1, 0, 1, ..., d
  96. int m_diagonal; // The final diagonal found on d-path that reaches destination
  97. // Add EndPoints storage for d-path
  98. EndPoints* AddEndPoints(int d)
  99. {
  100. int i = m_endPoints.Add(nullptr);
  101. EndPoints* e = EndPoints::New(m_endPoints.GetAllocator(), d);
  102. m_endPoints.Item(i, e);
  103. return e;
  104. }
  105. public:
  106. EditGraph(Allocator* alloc) : m_endPoints(alloc) {}
  107. ~EditGraph()
  108. {
  109. Allocator* alloc = m_endPoints.GetAllocator();
  110. m_endPoints.Map([=](int, EndPoints* e)
  111. {
  112. if (e)
  113. {
  114. e->Destroy(alloc);
  115. }
  116. });
  117. }
  118. //
  119. // This is the basic algorithm to find a shortest path in the edit graph from (0,0) to (lengthA, lengthB).
  120. // We iterate through d=0,1,2,... to find smallest d, where one d-path reaches (lengthA, lengthB).
  121. // - d-path must end on diagonal [-d, -d+2, ..., d-2, d]
  122. // - A furthest reaching d-path on diagonal k is composed of a furthest reaching d-1 path on diagonal k-1 or k+1,
  123. // followed by a vertical or horizontal move, followed by moving along diagonal k.
  124. //
  125. template <class ItemEquals>
  126. void FindPath(int lengthA, int lengthB, const ItemEquals& equals)
  127. {
  128. Assert(m_endPoints.Empty()); // Only support one FindPath
  129. int maxD;
  130. if (Int32Math::Add(lengthA, lengthB, &maxD) || maxD > INT_MAX / 2) // Limits maxD to simplify overflow handling
  131. {
  132. Math::DefaultOverflowPolicy();
  133. }
  134. // Pre-add virtual path -1
  135. {
  136. EndPoints& pre = *AddEndPoints(1);
  137. pre[1] = 0;
  138. }
  139. bool found = false;
  140. for (int d = 0; d <= maxD && !found; d++)
  141. {
  142. const EndPoints& v = *m_endPoints.Item(d); // d-1 path
  143. EndPoints& cur = *AddEndPoints(d); // d path
  144. for (int k = -d; k <= d; k += 2)
  145. {
  146. const bool verticalMove = (k == -d || (k != d && v[k - 1] < v[k + 1]));
  147. int x = verticalMove ? v[k + 1] : v[k - 1] + 1;
  148. int y = x - k;
  149. while (x < lengthA && y < lengthB && equals(x, y))
  150. {
  151. x++;
  152. y++;
  153. }
  154. cur[k] = x; // furthest reaching end point
  155. if (x == lengthA && y == lengthB)
  156. {
  157. m_diagonal = k;
  158. found = true;
  159. break;
  160. }
  161. }
  162. }
  163. Assert(found);
  164. }
  165. template <class Func>
  166. void MapEdits(const Func& map) const
  167. {
  168. // m_endPoints contains endPoints for paths: -1, 0, 1, ..., d
  169. int d = m_endPoints.Count() - 2;
  170. int k = m_diagonal;
  171. for (; d >= 0; d--)
  172. {
  173. const EndPoints& v = *m_endPoints.Item(d); // d-1 path
  174. const EndPoints& cur = *m_endPoints.Item(d + 1); // d path
  175. Assert(cur.getd() == d);
  176. const bool verticalMove = (k == -d || (k != d && v[k - 1] < v[k + 1]));
  177. int x0 = verticalMove ? v[k + 1] : v[k - 1] + 1;
  178. int x = cur[k];
  179. int y = x - k;
  180. while (x > x0)
  181. {
  182. map(EditKind::Update, --x, --y);
  183. }
  184. if (verticalMove)
  185. {
  186. if (d > 0) // Don't emit virtual initial move from path -1 to 0
  187. {
  188. map(EditKind::Insert, -1, --y);
  189. }
  190. k++;
  191. }
  192. else
  193. {
  194. map(EditKind::Delete, --x, -1);
  195. k--;
  196. }
  197. }
  198. }
  199. };
  200. struct Edit
  201. {
  202. EditKind kind;
  203. int indexA;
  204. int indexB;
  205. Edit() {}
  206. Edit(EditKind kind, int indexA, int indexB) :
  207. kind(kind), indexA(indexA), indexB(indexB)
  208. {
  209. Assert((kind == EditKind::Insert && indexA == -1 && indexB >= 0)
  210. || (kind == EditKind::Delete && indexA >= 0 && indexB == -1)
  211. || (kind == EditKind::Update && indexA >= 0 && indexB >= 0));
  212. }
  213. };
  214. typedef JsUtil::List<Edit, Allocator, /*isLeaf*/true> EditList;
  215. EditList m_edits;
  216. public:
  217. template <class ItemEquals>
  218. LongestCommonSubsequence(Allocator* alloc, int lengthA, int lengthB, const ItemEquals& equals) :
  219. m_edits(alloc)
  220. {
  221. EditGraph graph(alloc);
  222. graph.FindPath(lengthA, lengthB, equals);
  223. graph.MapEdits([this](EditKind kind, int indexA, int indexB)
  224. {
  225. m_edits.Add(Edit(kind, indexA, indexB));
  226. });
  227. }
  228. template <class Func>
  229. void MapEdits(const Func& map) const
  230. {
  231. for (int i = m_edits.Count() - 1; i >= 0; i--)
  232. {
  233. const Edit& e = m_edits.Item(i);
  234. map(e.kind, e.indexA, e.indexB);
  235. }
  236. }
  237. template <class Func>
  238. void MapMatches(const Func& map) const
  239. {
  240. MapEdits([&](EditKind kind, int indexA, int indexB)
  241. {
  242. if (kind == EditKind::Update)
  243. {
  244. map(indexA, indexB);
  245. }
  246. });
  247. }
  248. };
  249. //
  250. // Returns a distance [0..1] of the specified sequences. The smaller distance the more of their elements match.
  251. //
  252. template <class Allocator, class ItemEquals>
  253. double ComputeLongestCommonSubsequenceDistance(Allocator* alloc, int lengthA, int lengthB, const ItemEquals& equals)
  254. {
  255. Assert(lengthA >= 0 && lengthB >= 0);
  256. if (lengthA == 0 || lengthB == 0)
  257. {
  258. return (lengthA == lengthB) ? 0.0 : 1.0;
  259. }
  260. int lcsLength = 0;
  261. LongestCommonSubsequence<Allocator> lcs(alloc, lengthA, lengthB, equals);
  262. lcs.MapMatches([&](int, int)
  263. {
  264. ++lcsLength;
  265. });
  266. return 1.0 - (double)lcsLength / (double)max(lengthA, lengthB);
  267. }
  268. //-----------------------------------------------------------------------------
  269. // Base class for TreeComparers, used with TreeMatch. TreeComparers specify parse node details.
  270. //-----------------------------------------------------------------------------
  271. template <class SubClass, class Node>
  272. struct TreeComparerBase
  273. {
  274. typedef Node Node;
  275. typedef Node* PNode;
  276. static const double ExactMatchDistance;
  277. static const double EpsilonDistance;
  278. const SubClass* pThis() const { return static_cast<const SubClass*>(this); }
  279. SubClass* pThis() { return static_cast<SubClass*>(this); }
  280. // The number of distinct labels used in the tree.
  281. int LabelCount() const { return 0; }
  282. // Returns an integer label corresponding to the given node.
  283. // Returned value must be within [0, LabelCount).
  284. int GetLabel(PNode x) const { return 0; }
  285. // Returns N > 0 if the node with specified label can't change its N-th ancestor node, zero otherwise.
  286. // 1st ancestor is the node's parent node.
  287. // 2nd ancestor is the node's grandparent node.
  288. // etc.
  289. int TiedToAncestor(int label) { return 0; }
  290. // Calculates the distance [0..1] of two nodes.
  291. // The more similar the nodes the smaller the distance.
  292. //
  293. // Used to determine whether two nodes of the same label match.
  294. // Even if 0 is returned the nodes might be slightly different.
  295. double GetDistance(PNode x, PNode y) const { return 0; }
  296. // Returns true if the specified nodes have equal values.
  297. // Called with matching nodes (oldNode, newNode).
  298. // Return true if the values of the nodes are the same, or their difference is not important.
  299. bool ValuesEqual(PNode oldNode, PNode newNode) const { return true; }
  300. PNode GetParent(PNode x) const { return nullptr; }
  301. bool TryGetParent(PNode x, _Out_ PNode* p) const
  302. {
  303. *p = pThis()->GetParent(x);
  304. return *p != nullptr;
  305. }
  306. PNode GetAncestor(PNode node, int level) const
  307. {
  308. while (level > 0)
  309. {
  310. node = pThis()->GetParent(node);
  311. level--;
  312. }
  313. return node;
  314. }
  315. // Map children nodes of x
  316. template <class Func>
  317. void MapChildren(PNode x, const Func& func) const {}
  318. // Map all descendant nodes of x (not including x itself)
  319. template <class Func>
  320. void MapDescendants(PNode x, const Func& func) const
  321. {
  322. pThis()->MapChildren(x, [&](PNode child)
  323. {
  324. func(child);
  325. MapDescendants(child, func);
  326. });
  327. }
  328. // Map every node in the (sub)tree x.
  329. template <class Func>
  330. void MapTree(PNode x, const Func& func) const
  331. {
  332. func(x);
  333. pThis()->MapDescendants(x, func);
  334. }
  335. // Return true if specified nodes belong to the same tree. For debug only.
  336. bool TreesEqual(PNode left, PNode right) const { return true; }
  337. };
  338. template <class SubClass, class Node> const double TreeComparerBase<SubClass, Node>::ExactMatchDistance = 0.0;
  339. template <class SubClass, class Node> const double TreeComparerBase<SubClass, Node>::EpsilonDistance = 0.00001;
  340. //-----------------------------------------------------------------------------
  341. // Tree match algorithm, based on general algorithm described in
  342. // Change Detection in Hierarchically Structured Information
  343. // by Sudarshan S. Chawathe, Anand Rajaraman, Hector Garcia-Molina, and Jennifer Widom
  344. //
  345. // Derived from Roslyn implementation.
  346. //-----------------------------------------------------------------------------
  347. template <class TreeComparer, class Allocator>
  348. class TreeMatch
  349. {
  350. public:
  351. // ParseNodes are owned by Parser arena. Considered leaf here.
  352. typedef typename TreeComparer::PNode PNode;
  353. typedef JsUtil::List<PNode, Allocator, /*isLeaf*/true> NodeList;
  354. typedef JsUtil::BaseDictionary<PNode, PNode, typename ForceLeafAllocator<Allocator>::AllocatorType> NodeMap;
  355. private:
  356. static const double ExactMatchDistance;
  357. static const double EpsilonDistance;
  358. static const double MatchingDistance1;
  359. static const double MatchingDistance2;
  360. static const double MatchingDistance3;
  361. static const double MaxDistance;
  362. Allocator* alloc;
  363. const PNode root1;
  364. const PNode root2;
  365. TreeComparer comparer;
  366. NodeMap* oneToTwo;
  367. NodeMap* twoToOne;
  368. public:
  369. TreeMatch(Allocator* alloc, PNode root1, PNode root2, const TreeComparer& comparer = TreeComparer()) :
  370. alloc(alloc), root1(root1), root2(root2), comparer(comparer)
  371. {
  372. const int labelCount = comparer.LabelCount();
  373. // calculate chains (not including root node)
  374. AutoAllocatorObjectArrayPtr<NodeList, Allocator> nodes1(AllocatorNewArrayZ(Allocator, alloc, NodeList*, labelCount), labelCount, alloc);
  375. AutoAllocatorObjectArrayPtr<NodeList, Allocator> nodes2(AllocatorNewArrayZ(Allocator, alloc, NodeList*, labelCount), labelCount, alloc);
  376. int count1 = CategorizeNodesByLabels(root1, labelCount, nodes1);
  377. int count2 = CategorizeNodesByLabels(root2, labelCount, nodes2);
  378. AutoAllocatorObjectPtr<NodeMap, Allocator> map1(AllocatorNew(Allocator, alloc, NodeMap, alloc, count1), alloc);
  379. AutoAllocatorObjectPtr<NodeMap, Allocator> map2(AllocatorNew(Allocator, alloc, NodeMap, alloc, count2), alloc);
  380. this->oneToTwo = map1;
  381. this->twoToOne = map2;
  382. ComputeMatch(nodes1, nodes2, labelCount);
  383. // Succeeded. Detach local objects that are now owned by this instance.
  384. map1.Detach();
  385. map2.Detach();
  386. }
  387. ~TreeMatch()
  388. {
  389. DeleteObject<Allocator>(alloc, oneToTwo);
  390. DeleteObject<Allocator>(alloc, twoToOne);
  391. }
  392. const TreeComparer& Comparer() const { return comparer; }
  393. PNode OldRoot() const { return root1; }
  394. PNode NewRoot() const { return root2; }
  395. bool HasPartnerInTree1(PNode node2) const
  396. {
  397. Assert(comparer.TreesEqual(node2, root2));
  398. return twoToOne->ContainsKey(node2);
  399. }
  400. bool HasPartnerInTree2(PNode node1) const
  401. {
  402. Assert(comparer.TreesEqual(node1, root1));
  403. return oneToTwo->ContainsKey(node1);
  404. }
  405. bool TryGetPartnerInTree1(PNode node2, PNode* partner1) const
  406. {
  407. Assert(comparer.TreesEqual(node2, root2));
  408. return twoToOne->TryGetValue(node2, partner1);
  409. }
  410. bool TryGetPartnerInTree2(PNode node1, PNode* partner2) const
  411. {
  412. Assert(comparer.TreesEqual(node1, root1));
  413. return oneToTwo->TryGetValue(node1, partner2);
  414. }
  415. bool Contains(PNode node1, PNode node2) const
  416. {
  417. Assert(comparer.TreesEqual(node2, root2));
  418. PNode partner2;
  419. return TryGetPartnerInTree2(node1, &partner2) && node2 == partner2;
  420. }
  421. private:
  422. int CategorizeNodesByLabels(PNode root, int labelCount, _Out_writes_(labelCount) NodeList* nodes[])
  423. {
  424. int count = 0;
  425. comparer.MapDescendants(root, [&](PNode node)
  426. {
  427. int label = comparer.GetLabel(node);
  428. Assert(label >= 0 && label < labelCount);
  429. NodeList* list = nodes[label];
  430. if (!list)
  431. {
  432. list = NodeList::New(alloc);
  433. nodes[label] = list;
  434. }
  435. list->Add(node);
  436. count++;
  437. });
  438. return count;
  439. }
  440. void ComputeMatch(_In_reads_(labelCount) NodeList* nodes1[], _In_reads_(labelCount) NodeList* nodes2[], int labelCount)
  441. {
  442. // Root nodes always match but they might have been added as knownMatches
  443. if (!HasPartnerInTree2(root1))
  444. {
  445. Add(root1, root2);
  446. }
  447. // --- The original FastMatch algorithm ---
  448. //
  449. // For each leaf label l, and then for each internal node label l do:
  450. // a) S1 := chain T1(l)
  451. // b) S2 := chain T2(l)
  452. // c) lcs := LCS(S1, S2, Equal)
  453. // d) For each pair of nodes (x,y) in lcs add (x,y) to M.
  454. // e) Pair unmatched nodes with label l as in Algorithm Match, adding matches to M:
  455. // For each unmatched node x in T1, if there is an unmatched node y in T2 such that equal(x,y)
  456. // then add (x,y) to M.
  457. //
  458. // equal(x,y) is defined as follows:
  459. // x, y are leafs => equal(x,y) := label(x) == label(y) && compare(value(x), value(y)) <= f
  460. // x, y are nodes => equal(x,y) := label(x) == label(y) && |common(x,y)| / max(|x|, |y|) > t
  461. // where f, t are constants.
  462. //
  463. // --- Actual implementation ---
  464. //
  465. // We also categorize nodes by their labels, but then we proceed differently:
  466. //
  467. // 1) A label may be marked "tied to parent". Let x, y have both label l and l is "tied to parent".
  468. // Then (x,y) can be in M only if (parent(x), parent(y)) in M.
  469. // Thus we require labels of children tied to a parent to be preceded by all their possible parent labels.
  470. //
  471. // 2) Rather than defining function equal in terms of constants f and t, which are hard to get right,
  472. // we try to match multiple times with different thresholds for node distance.
  473. // The comparer defines the distance [0..1] between two nodes and it can do so by analyzing
  474. // the node structure and value. The comparer can tune the distance specifically for each node kind.
  475. // We first try to match nodes of the same labels to the exactly matching or almost matching counterparts.
  476. // Then we keep increasing the threshold and keep adding matches.
  477. for (int label = 0; label < labelCount; label++)
  478. {
  479. if (nodes1[label] && nodes2[label])
  480. {
  481. ComputeMatchForLabel(label, *nodes1[label], *nodes2[label]);
  482. }
  483. }
  484. }
  485. void ComputeMatchForLabel(int label, NodeList& s1, NodeList& s2)
  486. {
  487. int tiedToAncestor = comparer.TiedToAncestor(label);
  488. ComputeMatchForLabel(s1, s2, tiedToAncestor, EpsilonDistance); // almost exact match
  489. ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance1); // ok match
  490. ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance2); // ok match
  491. ComputeMatchForLabel(s1, s2, tiedToAncestor, MatchingDistance3); // ok match
  492. ComputeMatchForLabel(s1, s2, tiedToAncestor, MaxDistance); // any match
  493. }
  494. void ComputeMatchForLabel(NodeList& s1, NodeList& s2, int tiedToAncestor, double maxAcceptableDistance)
  495. {
  496. // Obviously, the algorithm below is O(n^2). However, in the common case, the 2 lists will
  497. // be sequences that exactly match. The purpose of "firstNonMatch2" is to reduce the complexity
  498. // to O(n) in this case. Basically, the pointer is the 1st non-matched node in the list of nodes of tree2
  499. // with the given label.
  500. // Whenever we match to firstNonMatch2 we set firstNonMatch2 to the subsequent node.
  501. // So in the case of totally matching sequences, we process them in O(n) -
  502. // both node1 and firstNonMatch2 will be advanced simultaneously.
  503. UnmatchedIterator i1(s1);
  504. for (;;)
  505. {
  506. PNode node1 = i1.GetNextUnmatched();
  507. if (!node1) break;
  508. Assert(!HasPartnerInTree2(node1));
  509. // Find node2 that matches node1 the best, i.e. has minimal distance.
  510. double bestDistance = MaxDistance;
  511. PNode bestMatch = nullptr;
  512. int bestMatchIndex = -1; // node1's best match index in list2
  513. bool matched = false;
  514. UnmatchedIterator i2(s2);
  515. for (;;)
  516. {
  517. PNode node2 = i2.GetNextUnmatched();
  518. if (!node2) break;
  519. Assert(!HasPartnerInTree1(node2));
  520. // this requires parents to be processed before their children:
  521. if (tiedToAncestor > 0)
  522. {
  523. // TODO: For nodes tied to their parents,
  524. // consider avoiding matching them to all other nodes of the same label.
  525. // Rather we should only match them with their siblings that share the same parent.
  526. PNode ancestor1 = comparer.GetAncestor(node1, tiedToAncestor);
  527. PNode ancestor2 = comparer.GetAncestor(node2, tiedToAncestor);
  528. Assert(comparer.GetLabel(ancestor1) < comparer.GetLabel(node1));
  529. if (!Contains(ancestor1, ancestor2))
  530. {
  531. continue;
  532. }
  533. }
  534. // We know that
  535. // 1. (node1, node2) not in M
  536. // 2. Both of their parents are matched to the same parent (or are not matched)
  537. //
  538. // Now, we have no other choice than comparing the node "values"
  539. // and looking for the one with the smaller distance.
  540. //
  541. double distance = comparer.GetDistance(node1, node2);
  542. if (distance < bestDistance)
  543. {
  544. matched = true;
  545. bestMatch = node2;
  546. bestMatchIndex = i2.CurIndex();
  547. bestDistance = distance;
  548. // We only stop if we've got an exact match. This is to resolve the problem
  549. // of entities with identical names(name is often used as the "value" of a
  550. // node) but with different "sub-values" (e.g. two locals may have the same name
  551. // but different types. Since the type is not part of the value, we don't want
  552. // to stop looking for the best match if we don't have an exact match).
  553. if (distance == ExactMatchDistance)
  554. {
  555. break;
  556. }
  557. }
  558. }
  559. if (matched && bestDistance <= maxAcceptableDistance)
  560. {
  561. Add(node1, bestMatch);
  562. i1.MarkCurrentMatched(); // i1's match is current node1
  563. i2.MarkMatched(bestMatchIndex); // i2's match is one of the nodes examined in the above for(;;) pass
  564. }
  565. }
  566. }
  567. void Add(PNode node1, PNode node2)
  568. {
  569. Assert(comparer.TreesEqual(node1, root1));
  570. Assert(comparer.TreesEqual(node2, root2));
  571. oneToTwo->Add(node1, node2);
  572. twoToOne->Add(node2, node1);
  573. }
  574. // The customized Match algorithm iterates over the 2 node lists, compares every unmatched node pair to match nodes.
  575. // To find the next unmatched node, original algorithm iterates over every node in each list, use a dictionary lookup
  576. // to test if the node has been matched or not, until it sees next unmatched node. This could be very expensive if the
  577. // lists are huge. E.g., assume the only diff is inserting a new node at the beginning of list2. Then for each node in
  578. // list1, it checks every node starting from the beginning new node in list2 for next unmatched node. This results in
  579. // O(N^2) dictionary lookups. And we do 5 passes of these.
  580. //
  581. // To improve on this, we can try to record every match span and directly jump to next unmatched position. Note that
  582. // in both lists once a node is matched, the list entry is no longer used. We can reuse that space to record extra info.
  583. // * Original PNode pointer value must be at even address. The list item must have 0 at bit0 (lowest bit).
  584. // * Once a node is matched, mark 1 at bit0. With this we can get rid of dictionary lookup.
  585. // * Next, for each matched entry, use the upper bits to record "next" unmatched index. Try to maintain match span,
  586. // so that from a matched node we can directly jump to next unmatched index.
  587. //
  588. // This class is for above purpose. Expected call pattern:
  589. // * GetNextUnmatched, [MarkCurrentMatched], GetNextUnmatched, [MarkCurrentMatched], ...
  590. // -- (A) With first MarkCurrentMatched we know the start of a match span.
  591. // -- (B) Subsequent MarkCurrentMatched indicates continuous match span.
  592. // -- (C) When MarkCurrentMatched is not called for an entry, we know the end of a match span. Record the whole
  593. // span (A)->(C). If walked again we would directly jump from (A) to (C).
  594. // * Random MarkMatched(i)
  595. // -- We don't know the exact match span. Just mark this entry "i" as matched, but set its "next" (upper bits) to 0.
  596. // -- During next pass, we can merge all adjacent match spans and individual matched entries to bigger match spans.
  597. // This would help next pass (we have 5).
  598. //
  599. class UnmatchedIterator
  600. {
  601. private:
  602. NodeList& list;
  603. int lastMatched; // last matched node index. -1 means no known last matched index.
  604. int index; // current examining index. Only moved by GetNextUnmatched().
  605. public:
  606. UnmatchedIterator(NodeList& list) :
  607. list(list),
  608. lastMatched(-1),
  609. index(-1)
  610. {
  611. VerifySize(list);
  612. }
  613. ~UnmatchedIterator()
  614. {
  615. // If we have lastMatched, we could have one of following:
  616. // * index is matched by MarkCurrentMatched(). Link lastMatched -> index (== lastMatched). GetNextUnmatched() can handle it.
  617. // * index remains unmatched (ends a matched sequence). Link lastMatched -> index.
  618. // * index is out of range. That means [lastMatched, ...end) are all matched. Link lastMatched -> index (out of range).
  619. //
  620. if (lastMatched >= 0)
  621. {
  622. SetNext(lastMatched, index);
  623. }
  624. }
  625. PNode GetNextUnmatched()
  626. {
  627. // If current ends a matched sequence, make a link [lastMatched -> current).
  628. if (lastMatched >= 0 && !IsMatched(index))
  629. {
  630. SetNext(lastMatched, index);
  631. lastMatched = -1;
  632. }
  633. ++index;
  634. if (index < list.Count())
  635. {
  636. if (IsMatched(index))
  637. {
  638. if (lastMatched < 0) // Check if current starts a matched sequence
  639. {
  640. lastMatched = index;
  641. }
  642. // Jumps all matched span, until sees an unmatched entry or the end.
  643. int next;
  644. while (index < list.Count() && IsNext(list.Item(index), &next))
  645. {
  646. index = max(next, index + 1); // Ensure moves forward (next could be 0, from individual MarkMatched() call).
  647. }
  648. }
  649. if (index < list.Count())
  650. {
  651. return list.Item(index);
  652. }
  653. }
  654. return nullptr;
  655. }
  656. int CurIndex() const { return index; }
  657. void MarkMatched(int i)
  658. {
  659. if (i == index)
  660. {
  661. MarkCurrentMatched();
  662. }
  663. else
  664. {
  665. SetMatched(i);
  666. }
  667. }
  668. void MarkCurrentMatched()
  669. {
  670. Assert(!IsMatched(index));
  671. SetMatched(index);
  672. if (lastMatched < 0) // If current starts a matched sequence
  673. {
  674. lastMatched = index;
  675. }
  676. }
  677. private:
  678. static void VerifySize(const NodeList& list)
  679. {
  680. if (list.Count() > INT_MAX / 2) // Limit max size as we used bit0
  681. {
  682. Math::DefaultOverflowPolicy();
  683. }
  684. }
  685. template <class P>
  686. static void SetMatched(P& node)
  687. {
  688. SetNext(node, 0);
  689. }
  690. static bool IsMatched(PNode node)
  691. {
  692. return !!(reinterpret_cast<UINT_PTR>(node) & 1);
  693. }
  694. template <class P>
  695. static void SetNext(P& node, int next)
  696. {
  697. UINT_PTR value = (static_cast<UINT_PTR>(next) << 1) | 1;
  698. node = reinterpret_cast<PNode>(value);
  699. }
  700. static bool IsNext(PNode node, _Out_ int* next)
  701. {
  702. UINT_PTR value = reinterpret_cast<UINT_PTR>(node);
  703. if (value & 1)
  704. {
  705. *next = static_cast<int>(value >> 1);
  706. return true;
  707. }
  708. return false;
  709. }
  710. void SetMatched(int i) { SetMatched(list.Item(i)); }
  711. bool IsMatched(int i) const { return IsMatched(list.Item(i)); }
  712. void SetNext(int i, int next) { SetNext(list.Item(i), next); }
  713. bool IsNext(int i, _Out_ int* next) const { return IsNext(list.Item(i), next); }
  714. };
  715. };
  716. template <class TreeComparer, class Allocator> const double TreeMatch<TreeComparer, Allocator>::ExactMatchDistance = TreeComparer::ExactMatchDistance;
  717. template <class TreeComparer, class Allocator> const double TreeMatch<TreeComparer, Allocator>::EpsilonDistance = TreeComparer::EpsilonDistance;
  718. template <class TreeComparer, class Allocator> const double TreeMatch<TreeComparer, Allocator>::MatchingDistance1 = 0.5;
  719. template <class TreeComparer, class Allocator> const double TreeMatch<TreeComparer, Allocator>::MatchingDistance2 = 1.0;
  720. template <class TreeComparer, class Allocator> const double TreeMatch<TreeComparer, Allocator>::MatchingDistance3 = 1.5;
  721. template <class TreeComparer, class Allocator> const double TreeMatch<TreeComparer, Allocator>::MaxDistance = 2.0;
  722. //-----------------------------------------------------------------------------
  723. // Represents an edit operation on a tree or a sequence of nodes.
  724. //-----------------------------------------------------------------------------
  725. template <class PNode>
  726. class Edit
  727. {
  728. private:
  729. EditKind kind;
  730. PNode node1;
  731. PNode node2;
  732. public:
  733. Edit() {}
  734. //
  735. // Insert nullptr NewNode
  736. // Delete OldNode nullptr
  737. // Move/Update OldNode NewNode
  738. //
  739. Edit(EditKind kind, PNode node1, PNode node2) :
  740. kind(kind), node1(node1), node2(node2)
  741. {
  742. Assert((node1 == nullptr) == (kind == EditKind::Insert));
  743. Assert((node2 == nullptr) == (kind == EditKind::Delete));
  744. }
  745. EditKind Kind() const { return kind; }
  746. PNode OldNode() const { return node1; }
  747. PNode NewNode() const { return node2; }
  748. };
  749. //-----------------------------------------------------------------------------
  750. // Represents a sequence of tree edits.
  751. //-----------------------------------------------------------------------------
  752. template <class TreeComparer, class Allocator>
  753. class EditScript
  754. {
  755. public:
  756. typedef TreeMatch<TreeComparer, Allocator> TreeMatch;
  757. typedef typename TreeMatch::PNode PNode;
  758. typedef typename TreeMatch::NodeList NodeList;
  759. typedef typename TreeMatch::NodeMap NodeMap;
  760. typedef JsUtil::List<Edit<PNode>, Allocator, /*isLeaf*/true> EditList;
  761. private:
  762. const TreeMatch& match;
  763. TreeComparer comparer;
  764. EditList edits;
  765. public:
  766. EditScript(Allocator* alloc, const TreeMatch& match) :
  767. match(match), comparer(match.Comparer()), edits(alloc)
  768. {
  769. AddUpdatesInsertsMoves();
  770. AddDeletes();
  771. }
  772. const EditList& Edits() const { return edits; }
  773. private:
  774. PNode Root1() const { return match.OldRoot(); }
  775. PNode Root2() const { return match.NewRoot(); }
  776. void AddUpdatesInsertsMoves()
  777. {
  778. // Breadth-first traversal.
  779. ProcessNode(Root2());
  780. JsUtil::Queue<PNode, Allocator> queue(edits.GetAllocator());
  781. queue.Enqueue(Root2());
  782. while (!queue.Empty())
  783. {
  784. PNode head = queue.Dequeue();
  785. comparer.MapChildren(head, [&](PNode child)
  786. {
  787. ProcessNode(child);
  788. queue.Enqueue(child);
  789. });
  790. }
  791. }
  792. void ProcessNode(PNode x)
  793. {
  794. Assert(comparer.TreesEqual(x, Root2()));
  795. // NOTE:
  796. // Our implementation differs from the algorithm described in the paper in following:
  797. // - We don't update M' and T1 since we don't need the final matching and the transformed tree.
  798. // - Insert and Move edits don't need to store the offset of the nodes relative to their parents,
  799. // so we don't calculate those. Thus we don't need to implement FindPos.
  800. // - We don't mark nodes "in order" since the marks are only needed by FindPos.
  801. // a)
  802. // Let x be the current node in the breadth-first search of T2.
  803. // Let y = parent(x).
  804. // Let z be the partner of parent(x) in M'. (note: we don't need z for insert)
  805. //
  806. // NOTE:
  807. // If we needed z then we would need to be updating M' as we encounter insertions.
  808. PNode w;
  809. bool hasPartner = match.TryGetPartnerInTree1(x, &w);
  810. PNode y;
  811. bool hasParent = comparer.TryGetParent(x, &y);
  812. if (!hasPartner)
  813. {
  814. // b) If x has no partner in M'.
  815. // i. k := FindPos(x)
  816. // ii. Append INS((w, a, value(x)), z, k) to E for a new identifier w.
  817. // iii. Add (w, x) to M' and apply INS((w, a, value(x)), z, k) to T1.
  818. edits.Add(Edit<PNode>(EditKind::Insert, /*node1*/nullptr, /*node2*/x));
  819. // NOTE:
  820. // We don't update M' here.
  821. }
  822. else if (hasParent)
  823. {
  824. // c) else if x is not a root
  825. // i. Let w be the partner of x in M', and let v = parent(w) in T1.
  826. PNode v = comparer.GetParent(w);
  827. // ii. if value(w) != value(x)
  828. // A. Append UPD(w, value(x)) to E
  829. // B. Apply UPD(w, value(x) to T1
  830. // Let the Comparer decide whether an update should be added to the edit list.
  831. // The Comparer defines what changes in node values it cares about.
  832. if (!comparer.ValuesEqual(w, x))
  833. {
  834. edits.Add(Edit<PNode>(EditKind::Update, /*node1*/w, /*node2*/x));
  835. }
  836. // If parents of w and x don't match, it's a move.
  837. // iii. if not (v, y) in M'
  838. // NOTE: The paper says (y, v) but that seems wrong since M': T1 -> T2 and w,v in T1 and x,y in T2.
  839. if (!match.Contains(v, y))
  840. {
  841. // A. Let z be the partner of y in M'. (NOTE: z not needed)
  842. // B. k := FindPos(x)
  843. // C. Append MOV(w, z, k)
  844. // D. Apply MOV(w, z, k) to T1
  845. edits.Add(Edit<PNode>(EditKind::Move, /*node1*/w, /*node2*/x));
  846. }
  847. }
  848. // d) AlignChildren(w, x)
  849. // NOTE: If we just applied an INS((w, a, value(x)), z, k) operation on tree T1
  850. // the newly created node w would have no children. So there is nothing to align.
  851. if (hasPartner)
  852. {
  853. AlignChildren(w, x);
  854. }
  855. }
  856. void AddDeletes()
  857. {
  858. // 3. Do a post-order traversal of T1.
  859. // a) Let w be the current node in the post-order traversal of T1.
  860. // b) If w has no partner in M' then append DEL(w) to E and apply DEL(w) to T1.
  861. //
  862. // NOTE: The fact that we haven't updated M' during the Insert phase
  863. // doesn't affect Delete phase. The original algorithm inserted new node n1 into T1
  864. // when an insertion INS(n1, n2) was detected. It also added (n1, n2) to M'.
  865. // Then in Delete phase n1 is visited but nothing is done since it has a partner n2 in M'.
  866. // Since we don't add n1 into T1, not adding (n1, n2) to M' doesn't affect the Delete phase.
  867. comparer.MapDescendants(Root1(), [&](PNode w)
  868. {
  869. if (!match.HasPartnerInTree2(w))
  870. {
  871. edits.Add(Edit<PNode>(EditKind::Delete, /*node1*/w, /*node2*/nullptr));
  872. }
  873. });
  874. }
  875. void AlignChildren(PNode w, PNode x)
  876. {
  877. Assert(comparer.TreesEqual(w, Root1()));
  878. Assert(comparer.TreesEqual(x, Root2()));
  879. Allocator* alloc = edits.GetAllocator();
  880. // Step 1
  881. // Make all children of w and and all children x "out of order"
  882. // NOTE: We don't need to mark nodes "in order".
  883. // Step 2
  884. // Let S1 be the sequence of children of w whose partner are children
  885. // of x and let S2 be the sequence of children of x whose partner are
  886. // children of w.
  887. NodeList s1(alloc), s2(alloc);
  888. if (!TryGetMatchedChildren(s1, w, x, [&](PNode e, PNode* partner) { return match.TryGetPartnerInTree2(e, partner); }) ||
  889. !TryGetMatchedChildren(s2, x, w, [&](PNode e, PNode* partner) { return match.TryGetPartnerInTree1(e, partner); }))
  890. {
  891. return;
  892. }
  893. // Step 3, 4
  894. // Define the function Equal(a,b) to be true if and only if (a,b) in M'
  895. // Let S <- LCS(S1, S2, Equal)
  896. NodeMap s(alloc);
  897. {
  898. LongestCommonSubsequence<Allocator> lcs(alloc, s1.Count(), s2.Count(), [&](int indexA, int indexB)
  899. {
  900. return match.Contains(s1.Item(indexA), s2.Item(indexB));
  901. });
  902. lcs.MapMatches([&](int indexA, int indexB)
  903. {
  904. s.AddNew(s1.Item(indexA), s2.Item(indexB));
  905. });
  906. }
  907. // Step 5
  908. // For each (a,b) in S, mark nodes a and b "in order"
  909. // NOTE: We don't need to mark nodes "in order".
  910. // Step 6
  911. // For each a in S1, b in S2 such that (a,b) in M but (a,b) not in S
  912. // (a) k <- FindPos(b)
  913. // (b) Append MOV(a,w,k) to E and apply MOV(a,w,k) to T1
  914. // (c) Mark a and b "in order"
  915. // NOTE: We don't mark nodes "in order".
  916. s1.Map([&](int index, PNode a)
  917. {
  918. PNode b;
  919. if (match.TryGetPartnerInTree2(a, &b) // (a,b) in M
  920. && comparer.GetParent(b) == x // => b in S2 since S2 == { b | parent(b) == x && parent(partner(b)) == w }
  921. && !ContainsPair(s, a, b)) // (a,b) not in S
  922. {
  923. Assert(comparer.TreesEqual(a, Root1()));
  924. Assert(comparer.TreesEqual(b, Root2()));
  925. edits.Add(Edit<PNode>(EditKind::Reorder, /*node1*/a, /*node2*/b));
  926. }
  927. });
  928. }
  929. // Helper: Get the sequence of children of x whose partner are children of y.
  930. template <class TryGetPartnerFunc>
  931. bool TryGetMatchedChildren(NodeList& nodes, PNode x, PNode y, const TryGetPartnerFunc& tryGetPartner)
  932. {
  933. Assert(nodes.Empty());
  934. comparer.MapChildren(x, [&](PNode e)
  935. {
  936. PNode partner;
  937. if (tryGetPartner(e, &partner) && comparer.GetParent(partner) == y)
  938. {
  939. nodes.Add(e);
  940. }
  941. });
  942. return !nodes.Empty();
  943. }
  944. static bool ContainsPair(const NodeMap& dict, PNode a, PNode b)
  945. {
  946. PNode value;
  947. return dict.TryGetValue(a, &value) && value == b;
  948. }
  949. };