pdf.ts 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. // @ts-ignore
  2. import pdfjs from 'pdfjs-dist';
  3. // @ts-ignore
  4. import pdfjsWorker from 'pdfjs-dist/build/pdf.worker.entry';
  5. import { ProgressType, ViewportType } from '../constants/type';
  6. import { objIsEmpty } from './utility';
  7. import { delay } from './time';
  8. pdfjs.GlobalWorkerOptions.workerSrc = pdfjsWorker;
  9. let normalizationRegex: any = null;
  10. const CHARACTERS_TO_NORMALIZE: {[index: string]: any} = {
  11. '\u2018': '\'', // Left single quotation mark
  12. '\u2019': '\'', // Right single quotation mark
  13. '\u201A': '\'', // Single low-9 quotation mark
  14. '\u201B': '\'', // Single high-reversed-9 quotation mark
  15. '\u201C': '"', // Left double quotation mark
  16. '\u201D': '"', // Right double quotation mark
  17. '\u201E': '"', // Double low-9 quotation mark
  18. '\u201F': '"', // Double high-reversed-9 quotation mark
  19. '\u00BC': '1/4', // Vulgar fraction one quarter
  20. '\u00BD': '1/2', // Vulgar fraction one half
  21. '\u00BE': '3/4', // Vulgar fraction three quarters
  22. };
  23. export const fetchPdf = async (
  24. src: string, cb?: (progress: ProgressType) => void,
  25. ): Promise<any> => {
  26. try {
  27. const loadingTask = pdfjs.getDocument({
  28. url: src,
  29. });
  30. if (cb) {
  31. loadingTask.onProgress = (progress: ProgressType): void => {
  32. cb(progress);
  33. };
  34. }
  35. const pdf = await loadingTask.promise;
  36. return pdf;
  37. } catch (e) {
  38. console.log(e);
  39. }
  40. return {};
  41. };
  42. export const renderTextLayer = async ({
  43. pdfPage,
  44. textLayer,
  45. viewport,
  46. }: {
  47. pdfPage: any;
  48. textLayer: HTMLElement;
  49. viewport: ViewportType;
  50. }): Promise<any> => {
  51. const textContent = await pdfPage.getTextContent();
  52. pdfjs.renderTextLayer({
  53. textContent,
  54. container: textLayer,
  55. viewport,
  56. textDivs: [],
  57. });
  58. };
  59. export const renderPdfPage = async ({
  60. rootEle,
  61. pdfPage,
  62. viewport,
  63. }: {
  64. rootEle: HTMLElement;
  65. pdfPage: any;
  66. viewport: ViewportType;
  67. }): Promise<any> => {
  68. if (rootEle) {
  69. const canvas: HTMLCanvasElement = rootEle.querySelectorAll('canvas')[0] as HTMLCanvasElement;
  70. const textLayer: HTMLDivElement = rootEle.querySelector('[data-id="text-layer"]') as HTMLDivElement;
  71. if (canvas) {
  72. const context: CanvasRenderingContext2D = canvas.getContext('2d') as CanvasRenderingContext2D;
  73. canvas.height = viewport.height;
  74. canvas.width = viewport.width;
  75. const renderContext = {
  76. canvasContext: context,
  77. viewport,
  78. };
  79. if (!objIsEmpty(pdfPage)) {
  80. const renderTask = pdfPage.render(renderContext);
  81. await renderTask.promise;
  82. }
  83. textLayer.innerHTML = '';
  84. await delay(200);
  85. await renderTextLayer({
  86. pdfPage,
  87. textLayer,
  88. viewport,
  89. });
  90. }
  91. }
  92. };
  93. export const normalize = (text: string): string => {
  94. if (!normalizationRegex) {
  95. // Compile the regular expression for text normalization once.
  96. const replace = Object.keys(CHARACTERS_TO_NORMALIZE).join('');
  97. normalizationRegex = new RegExp(`[${replace}]`, 'g');
  98. }
  99. return text.replace(normalizationRegex, ch => CHARACTERS_TO_NORMALIZE[ch]);
  100. };
  101. export const calcFindPhraseMatch = (pageContent: string, query: string): number[] => {
  102. const matches = [];
  103. const queryLen = query.length;
  104. let matchIdx = -queryLen;
  105. if (pageContent) {
  106. while (query) {
  107. matchIdx = pageContent.indexOf(query, matchIdx + queryLen);
  108. if (matchIdx === -1) break;
  109. matches.push(matchIdx);
  110. }
  111. }
  112. return matches;
  113. };
  114. export const convertMatches = (
  115. queryString: string,
  116. matchIndex: number,
  117. textContentItem: any[],
  118. ): Record<string, any> => {
  119. let i = 0;
  120. let iIndex = 0;
  121. const end = textContentItem.length - 1;
  122. const queryLen = queryString.length;
  123. // Loop over the divIdxs.
  124. while (i !== end && matchIndex >= (iIndex + textContentItem[i].length)) {
  125. iIndex += textContentItem[i].length;
  126. i += 1;
  127. }
  128. if (i === textContentItem.length) {
  129. console.error('Could not find a matching mapping');
  130. }
  131. const match: Record<string, any> = {
  132. begin: {
  133. divIdx: i,
  134. offset: matchIndex - iIndex,
  135. },
  136. };
  137. // Calculate the end position.
  138. // eslint-disable-next-line no-param-reassign
  139. matchIndex += queryLen;
  140. // Somewhat the same array as above, but use > instead of >= to get
  141. // the end position right.
  142. while (i !== end && matchIndex > (iIndex + textContentItem[i].length)) {
  143. iIndex += textContentItem[i].length;
  144. i += 1;
  145. }
  146. match.end = {
  147. divIdx: i,
  148. offset: matchIndex - iIndex,
  149. };
  150. return match;
  151. };
  152. export const getPdfPage = async (pdf: any, pageNum: number): Promise<any> => {
  153. const page = await pdf.getPage(pageNum);
  154. return page;
  155. };