pdf.ts 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. import pdfjs from 'pdfjs-dist';
  2. import { delay } from './time';
  // Module-level pdf.js configuration; runs once when this module is imported.

  // Character-map (CMap) location and packing flag. The same two values are
  // also passed explicitly to getDocument() in fetchPdf.
  // NOTE(review): confirm pdf.js actually reads these from the module object.
  pdfjs.cMapUrl = '/static/cmaps/';
  pdfjs.cMapPacked = true;

  // URL from which pdf.js loads its worker script.
  pdfjs.GlobalWorkerOptions.workerSrc = '/static/build/pdf.worker.min.js';
  /**
   * Replacement table used by `normalize`: each key is a single typographic
   * character and each value is the plain-ASCII text substituted for it.
   */
  const CHARACTERS_TO_NORMALIZE: Record<string, any> = {
    // Single quotation marks (left, right, low-9, high-reversed-9) -> apostrophe.
    '\u2018': "'",
    '\u2019': "'",
    '\u201A': "'",
    '\u201B': "'",

    // Double quotation marks (left, right, low-9, high-reversed-9) -> straight quote.
    '\u201C': '"',
    '\u201D': '"',
    '\u201E': '"',
    '\u201F': '"',

    // Vulgar fractions (one quarter, one half, three quarters) -> digits with a slash.
    '\u00BC': '1/4',
    '\u00BD': '1/2',
    '\u00BE': '3/4',
  };

  // Cache slot for the regular expression built from the table's keys.
  // It stays null until `normalize` compiles the expression on first use.
  let normalizationRegex: any = null;
  /**
   * Starts loading the PDF at `src` through pdf.js and resolves with the loaded
   * document.
   *
   * Failures are not propagated: anything thrown while loading is written to
   * the console and the promise resolves with an empty object instead.
   *
   * @param src URL handed to `pdfjs.getDocument`.
   * @param cb Optional listener; when given, it receives every value reported
   *   through the loading task's `onProgress` hook.
   * @returns The value of the loading task's promise, or `{}` after a failure.
   */
  export const fetchPdf = async (
    src: string,
    cb?: (progress: ProgressType) => void
  ): Promise<any> => {
    try {
      const task = pdfjs.getDocument({
        url: src,
        cMapUrl: '/static/cmaps/',
        cMapPacked: true,
      });

      if (cb) {
        // Forward only the progress value to the caller's listener.
        task.onProgress = (update: ProgressType): void => {
          cb(update);
        };
      }

      return await task.promise;
    } catch (err) {
      console.log(err);
      return {};
    }
  };
  /**
   * Renders the text layer of `pdfPage` into `textLayer` and reports the
   * created text elements.
   *
   * @param pdfPage Page object; its `getTextContent()` result is rendered.
   * @param textLayer Container element passed to pdf.js as the render target.
   * @param viewport Viewport passed through to pdf.js.
   * @param setTextDivs Optional callback invoked with the array that pdf.js
   *   was given to collect the text elements, once rendering has finished.
   * @returns A promise that settles after rendering and the callback.
   */
  export const renderTextLayer = async ({
    pdfPage,
    textLayer,
    viewport,
    setTextDivs,
  }: {
    pdfPage: any;
    textLayer: HTMLElement;
    viewport: ViewportType;
    setTextDivs?: (elements: HTMLElement[]) => void;
  }): Promise<any> => {
    const textContent = await pdfPage.getTextContent();
    const textDivs: any[] = [];

    const task = pdfjs.renderTextLayer({
      textContent,
      container: textLayer,
      viewport,
      textDivs,
    });

    // pdf.js returns a render task rather than a promise, so awaiting the task
    // object itself would not wait for rendering. Await its `promise` when
    // there is one; otherwise fall back to the returned value.
    await (task && task.promise ? task.promise : task);

    if (setTextDivs) {
      setTextDivs(textDivs);
    }
  };
  /**
   * Draws `pdfPage` onto the `<canvas>` found inside `rootEle`, then rebuilds
   * the text layer in the `[data-id="text-layer"]` element.
   *
   * Nothing happens when `rootEle` is missing or contains no canvas. When
   * `pdfPage` is missing the canvas is still resized and the text layer is
   * still cleared, but nothing is rendered. When the text-layer element is
   * missing only the canvas is rendered.
   *
   * @param rootEle Element containing the canvas and the text-layer container.
   * @param pdfPage Page object providing `render()` and `getTextContent()`.
   * @param viewport Supplies the canvas dimensions and is passed to pdf.js.
   * @param setRenderTask Receives the render task as soon as it is created.
   * @param setTextDivs Receives the text elements once the text layer is done.
   * @returns A promise that settles when all rendering steps have finished.
   */
  export const renderPdfPage = async ({
    rootEle,
    pdfPage,
    viewport,
    setRenderTask,
    setTextDivs,
  }: {
    rootEle: HTMLElement;
    pdfPage: any;
    viewport: ViewportType;
    setRenderTask: any;
    setTextDivs: (elements: HTMLElement[]) => void;
  }): Promise<any> => {
    if (!rootEle) {
      return;
    }

    const canvas = rootEle.querySelector('canvas');
    if (!canvas) {
      return;
    }
    const textLayer = rootEle.querySelector<HTMLElement>(
      '[data-id="text-layer"]'
    );

    const context = canvas.getContext('2d') as CanvasRenderingContext2D;
    canvas.height = viewport.height;
    canvas.width = viewport.width;

    if (pdfPage) {
      const renderTask = pdfPage.render({ canvasContext: context, viewport });
      setRenderTask(renderTask);
      // A rejected render is logged rather than rethrown, so the text layer
      // below is still rebuilt.
      await renderTask.promise.catch((reason: string) => {
        console.log(`stopped ${reason}`);
      });
    }

    // The text-layer element may be absent; skip it instead of throwing.
    if (textLayer) {
      textLayer.innerHTML = '';
    }

    // Text rendering needs both a page to read from and a container to fill.
    if (!pdfPage || !textLayer) {
      return;
    }

    await delay(200);
    await renderTextLayer({
      pdfPage,
      textLayer,
      viewport,
      setTextDivs,
    });
  };
  /**
   * Replaces every character listed in `CHARACTERS_TO_NORMALIZE` with its
   * mapped replacement text.
   *
   * @param text Input string.
   * @returns A copy of `text` with each listed character substituted.
   */
  export const normalize = (text: string): string => {
    // Build the character-class expression on first use and cache it in the
    // module-level variable for later calls.
    normalizationRegex =
      normalizationRegex ||
      new RegExp(`[${Object.keys(CHARACTERS_TO_NORMALIZE).join('')}]`, 'g');

    return text.replace(
      normalizationRegex,
      (matched: string) => CHARACTERS_TO_NORMALIZE[matched]
    );
  };
  /**
   * Finds the start index of every non-overlapping occurrence of `query` in
   * `pageContent`, scanning left to right.
   *
   * @param pageContent Text to search.
   * @param query Phrase to look for.
   * @returns Start indices in ascending order; empty when either argument is
   *   empty or nothing matches.
   */
  export const calculatePhraseMatch = (
    pageContent: string,
    query: string
  ): number[] => {
    const hits: number[] = [];
    if (!pageContent || !query) {
      return hits;
    }

    // After each hit the search resumes right behind it, so hits never overlap.
    const step = query.length;
    for (
      let at = pageContent.indexOf(query, 0);
      at !== -1;
      at = pageContent.indexOf(query, at + step)
    ) {
      hits.push(at);
    }
    return hits;
  };
  /**
   * Maps a match located by character index in the concatenated text back to
   * positions inside the individual text items.
   *
   * @param queryString Matched phrase; only its length is used.
   * @param matchIndex Start index of the match in the concatenation of all
   *   entries of `textContentItem`.
   * @param textContentItem Text items whose `length` values are accumulated,
   *   in order, to locate the match.
   * @returns `{ begin, end }`, each holding `divIdx` (index of the item) and
   *   `offset` (character offset inside that item). The search never advances
   *   past the last item, so an index beyond the total length produces an
   *   offset larger than the last item. For an empty `textContentItem` an
   *   error is logged and both positions use `divIdx` 0.
   */
  export const convertMatches = (
    queryString: string,
    matchIndex: number,
    textContentItem: any[]
  ): Record<string, any> => {
    const lastIdx = textContentItem.length - 1;
    let divIdx = 0;
    // Total length of the items that precede `divIdx`.
    let consumed = 0;

    if (textContentItem.length === 0) {
      // No item can contain the match. Both loops below are skipped because
      // `lastIdx` is negative.
      console.error('Could not find a matching mapping');
    }

    // Advance to the item that contains the first matched character.
    while (
      divIdx < lastIdx &&
      matchIndex >= consumed + textContentItem[divIdx].length
    ) {
      consumed += textContentItem[divIdx].length;
      divIdx += 1;
    }
    const begin = { divIdx, offset: matchIndex - consumed };

    // Continue to the item holding the end of the match. `>` instead of `>=`
    // keeps a match that ends exactly on an item boundary inside that item.
    const matchEnd = matchIndex + queryString.length;
    while (
      divIdx < lastIdx &&
      matchEnd > consumed + textContentItem[divIdx].length
    ) {
      consumed += textContentItem[divIdx].length;
      divIdx += 1;
    }
    const end = { divIdx, offset: matchEnd - consumed };

    return { begin, end };
  };
  /**
   * Asks the document for one of its pages.
   *
   * @param pdf Document object exposing `getPage`.
   * @param pageNum Page number forwarded unchanged to `pdf.getPage`.
   * @returns Whatever `pdf.getPage(pageNum)` resolves with.
   */
  export const getPdfPage = async (pdf: any, pageNum: number): Promise<any> =>
    await pdf.getPage(pageNum);