LineOffsetCache.h 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269
  1. //-------------------------------------------------------------------------------------------------------
  2. // Copyright (C) Microsoft. All rights reserved.
  3. // Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
  4. //-------------------------------------------------------------------------------------------------------
  5. #pragma once
  6. namespace JsUtil
  7. {
  8. template <typename TAllocator>
  9. class LineOffsetCache
  10. {
  11. public:
  12. // Stores line offset information for a line in the source info. The index of this
  13. // item in the LineOffsetCacheList determines which line it is.
  14. struct LineOffsetCacheItem
  15. {
  16. // The character offset of where the line begins starting from the start of the source info.
  17. charcount_t characterOffset;
  18. // The byte offset of where the line begins starting from the start of the source info's character buffer (UTF8).
  19. charcount_t byteOffset;
  20. };
  21. private:
  22. typedef List<LineOffsetCacheItem, TAllocator, true /*isLeaf*/> LineOffsetCacheList;
  23. typedef ReadOnlyList<LineOffsetCacheItem> LineOffsetCacheReadOnlyList;
  24. public:
  25. static int FindLineForCharacterOffset(
  26. _In_z_ LPCUTF8 sourceStartCharacter,
  27. _In_z_ LPCUTF8 sourceEndCharacter,
  28. charcount_t &inOutLineCharOffset,
  29. charcount_t &inOutByteOffset,
  30. charcount_t characterOffset)
  31. {
  32. int lastLine = 0;
  33. while (FindNextLine(sourceStartCharacter, sourceEndCharacter, inOutLineCharOffset, inOutByteOffset, characterOffset))
  34. {
  35. lastLine++;
  36. }
  37. return lastLine;
  38. }
  39. LineOffsetCache(TAllocator* allocator,
  40. _In_z_ LPCUTF8 sourceStartCharacter,
  41. _In_z_ LPCUTF8 sourceEndCharacter,
  42. charcount_t startingCharacterOffset = 0,
  43. charcount_t startingByteOffset = 0) :
  44. allocator(allocator),
  45. isCacheBuilt(false)
  46. {
  47. AssertMsg(allocator, "An allocator must be supplied to the cache for allocation of items.");
  48. LineOffsetCacheList *list = AllocatorNew(TAllocator, allocator, LineOffsetCacheList, allocator);
  49. this->lineOffsetCacheList = list;
  50. this->BuildCache(list, sourceStartCharacter, sourceEndCharacter, startingCharacterOffset, startingByteOffset);
  51. }
  52. LineOffsetCache(TAllocator *allocator,
  53. _In_reads_(numberOfLines) const LineOffsetCacheItem *lines,
  54. __in int numberOfLines) :
  55. allocator(allocator),
  56. isCacheBuilt(false)
  57. {
  58. this->lineOffsetCacheList = LineOffsetCacheReadOnlyList::New(allocator, (LineOffsetCacheItem *)lines, numberOfLines);
  59. }
  60. ~LineOffsetCache()
  61. {
  62. if (this->lineOffsetCacheList != nullptr)
  63. {
  64. this->lineOffsetCacheList->Delete();
  65. }
  66. }
  67. // outLineCharOffset - The character offset of the start of the line returned
  68. int GetLineForCharacterOffset(charcount_t characterOffset, charcount_t *outLineCharOffset, charcount_t *outByteOffset)
  69. {
  70. Assert(this->lineOffsetCacheList->Count() > 0);
  71. // The list is sorted, so binary search to find the line info.
  72. int closestIndex = -1;
  73. int minRange = INT_MAX;
  74. this->lineOffsetCacheList->BinarySearch([&](const LineOffsetCacheItem& item, int index)
  75. {
  76. int offsetRange = characterOffset - item.characterOffset;
  77. if (offsetRange >= 0)
  78. {
  79. if (offsetRange < minRange)
  80. {
  81. // There are potentially many lines with starting offsets greater than the one we're searching
  82. // for. As a result, we should track which index we've encountered so far that is the closest
  83. // to the offset we're looking for without going under. This will find the line that contains
  84. // the offset.
  85. closestIndex = index;
  86. minRange = offsetRange;
  87. }
  88. // Search lower to see if we can find a closer index.
  89. return -1;
  90. }
  91. else
  92. {
  93. // Search higher to get into a range that is greater than the offset.
  94. return 1;
  95. }
  96. // Note that we purposely don't return 0 (==) here. We want the search to end in failure (-1) because
  97. // we're searching for the closest element, not necessarily an exact element offset. Exact offsets
  98. // are possible when the offset we're searching for is the first character of the line, but that will
  99. // be handled by the if statement above.
  100. });
  101. if (closestIndex >= 0)
  102. {
  103. LineOffsetCacheItem lastItem = this->lineOffsetCacheList->Item(closestIndex);
  104. if (outLineCharOffset != nullptr)
  105. {
  106. *outLineCharOffset = lastItem.characterOffset;
  107. }
  108. if (outByteOffset != nullptr)
  109. {
  110. *outByteOffset = lastItem.byteOffset;
  111. }
  112. }
  113. return closestIndex;
  114. }
  115. charcount_t GetCharacterOffsetForLine(charcount_t line, charcount_t *outByteOffset) const
  116. {
  117. AssertMsg(line < this->GetLineCount(), "Invalid line value passed in.");
  118. LineOffsetCacheItem item = this->lineOffsetCacheList->Item(line);
  119. if (outByteOffset != nullptr)
  120. {
  121. *outByteOffset = item.byteOffset;
  122. }
  123. return item.characterOffset;
  124. }
  125. uint32 GetLineCount() const
  126. {
  127. AssertMsg(this->lineOffsetCacheList != nullptr, "The list was either not set from the ByteCode or not created.");
  128. return this->lineOffsetCacheList->Count();
  129. }
  130. const LineOffsetCacheItem* GetItems()
  131. {
  132. return this->lineOffsetCacheList->GetBuffer();
  133. }
  134. private:
  135. static bool FindNextLine(_In_z_ LPCUTF8 &currentSourcePosition, _In_z_ LPCUTF8 sourceEndCharacter, charcount_t &inOutCharacterOffset, charcount_t &inOutByteOffset, charcount_t maxCharacterOffset = UINT32_MAX)
  136. {
  137. charcount_t currentCharacterOffset = inOutCharacterOffset;
  138. charcount_t currentByteOffset = inOutByteOffset;
  139. utf8::DecodeOptions options = utf8::doAllowThreeByteSurrogates;
  140. while (currentSourcePosition < sourceEndCharacter)
  141. {
  142. LPCUTF8 previousCharacter = currentSourcePosition;
  143. // Decode from UTF8 to wide char. Note that Decode will advance the current character by 1 at least.
  144. char16 decodedCharacter = utf8::Decode(currentSourcePosition, sourceEndCharacter, options);
  145. bool wasLineEncountered = false;
  146. switch (decodedCharacter)
  147. {
  148. case _u('\r'):
  149. // Check if the next character is a '\n'. If so, consume that character as well
  150. // (consider as one line).
  151. if (*currentSourcePosition == '\n')
  152. {
  153. ++currentSourcePosition;
  154. ++currentCharacterOffset;
  155. }
  156. // Intentional fall-through.
  157. case _u('\n'):
  158. case 0x2028:
  159. case 0x2029:
  160. // Found a new line.
  161. wasLineEncountered = true;
  162. break;
  163. }
  164. // Move to the next character offset.
  165. ++currentCharacterOffset;
  166. // Count the current byte offset we're at in the UTF-8 buffer.
  167. // The character size can be > 1 for unicode characters.
  168. currentByteOffset += static_cast<int>(currentSourcePosition - previousCharacter);
  169. if (wasLineEncountered)
  170. {
  171. inOutCharacterOffset = currentCharacterOffset;
  172. inOutByteOffset = currentByteOffset;
  173. return true;
  174. }
  175. else if (currentCharacterOffset >= maxCharacterOffset)
  176. {
  177. return false;
  178. }
  179. }
  180. return false;
  181. }
  182. // Builds the cache of line offsets from the passed in source.
  183. void BuildCache(_In_ LineOffsetCacheList *list, _In_z_ LPCUTF8 sourceStartCharacter,
  184. _In_z_ LPCUTF8 sourceEndCharacter,
  185. charcount_t startingCharacterOffset = 0,
  186. charcount_t startingByteOffset = 0)
  187. {
  188. AssertMsg(sourceStartCharacter, "The source start character passed in is null.");
  189. AssertMsg(sourceEndCharacter, "The source end character passed in is null.");
  190. AssertMsg(sourceStartCharacter <= sourceEndCharacter, "The source start character should not be beyond the source end character.");
  191. AssertMsg(!this->isCacheBuilt, "The cache is already built.");
  192. // Add the first line in the cache list.
  193. this->AddLine(list, startingCharacterOffset, startingByteOffset);
  194. while (FindNextLine(sourceStartCharacter, sourceEndCharacter, startingCharacterOffset, startingByteOffset))
  195. {
  196. this->AddLine(list, startingCharacterOffset, startingByteOffset);
  197. }
  198. isCacheBuilt = true;
  199. }
  200. // Tracks a new line offset in the cache.
  201. void AddLine(_In_ LineOffsetCacheList *list, int characterOffset, int byteOffset)
  202. {
  203. AssertMsg(characterOffset >= 0, "The character offset is invalid.");
  204. AssertMsg(byteOffset >= 0, "The byte offset is invalid.");
  205. LineOffsetCacheItem item;
  206. item.characterOffset = characterOffset;
  207. item.byteOffset = byteOffset;
  208. list->Add(item);
  209. #if DBG
  210. if (list->Count() > 1)
  211. {
  212. // Ensure that the list remains sorted during insertion.
  213. LineOffsetCacheItem previousItem = list->Item(list->Count() - 2);
  214. AssertMsg(item.characterOffset > previousItem.characterOffset, "The character offsets must be inserted in increasing order per line.");
  215. AssertMsg(item.byteOffset > previousItem.byteOffset, "The byte offsets must be inserted in increasing order per line.");
  216. }
  217. #endif // DBG
  218. }
  219. private:
  220. FieldNoBarrier(TAllocator*) allocator;
  221. // Line offset cache list used for quickly finding line/column offsets.
  222. Field(LineOffsetCacheReadOnlyList*) lineOffsetCacheList;
  223. Field(bool) isCacheBuilt;
  224. };
  225. }