structured-text.h 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
  1. #ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
  2. #define MUPDF_FITZ_STRUCTURED_TEXT_H
  3. #include "mupdf/fitz/system.h"
  4. #include "mupdf/fitz/context.h"
  5. #include "mupdf/fitz/geometry.h"
  6. #include "mupdf/fitz/font.h"
  7. #include "mupdf/fitz/colorspace.h"
  8. #include "mupdf/fitz/image.h"
  9. #include "mupdf/fitz/output.h"
  10. #include "mupdf/fitz/device.h"
  11. /*
  12. Text extraction device: Used for searching, format conversion etc.
  13. (In development - Subject to change in future versions)
  14. */
  /*
      Forward declarations for the structured text types. Only the
      typedef names are introduced here; the structures themselves are
      defined further down in this header.
  */

  /* A page and the typed blocks it is made of. */
  typedef struct fz_stext_page_s fz_stext_page;
  typedef struct fz_page_block_s fz_page_block;
  typedef struct fz_stext_block_s fz_stext_block;
  typedef struct fz_image_block_s fz_image_block;

  /* The contents of a text block: lines, spans and characters. */
  typedef struct fz_stext_line_s fz_stext_line;
  typedef struct fz_stext_span_s fz_stext_span;
  typedef struct fz_stext_char_s fz_stext_char;

  /* Text styles and the sheet that collects them. */
  typedef struct fz_stext_style_s fz_stext_style;
  typedef struct fz_stext_sheet_s fz_stext_sheet;
  /*
      Option flags for the structured text device. The two values are
      distinct bits (1 and 2).
      NOTE(review): presumably they are OR:ed together into
      fz_stext_options.flags - confirm against the implementation.

      No comma follows the last enumerator: a trailing comma in an
      enumerator list is rejected by strict C90 and C++98 compilers.
  */
  enum
  {
      /*
          FZ_STEXT_PRESERVE_LIGATURES: If this option is activated,
          ligatures are passed through to the application in their
          original form. If it is deactivated, ligatures are expanded
          into their constituent parts, e.g. the ligature ffi is
          expanded into the three separate characters f, f and i.
      */
      FZ_STEXT_PRESERVE_LIGATURES = 1,

      /*
          FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated,
          whitespace is passed through to the application in its
          original form. If it is deactivated, any type of horizontal
          whitespace (including horizontal tabs) is replaced with space
          characters of variable width.
      */
      FZ_STEXT_PRESERVE_WHITESPACE = 2
  };
  41. /*
  42. fz_stext_sheet: A text sheet contains a list of distinct text styles
  43. used on a page (or a series of pages).
  44. */
  45. struct fz_stext_sheet_s
  46. {
  47. int maxid;
  48. fz_stext_style *style;
  49. };
  50. /*
  51. fz_stext_style: A text style contains details of a distinct text style
  52. used on a page.
  53. */
  54. struct fz_stext_style_s
  55. {
  56. fz_stext_style *next;
  57. int id;
  58. fz_font *font;
  59. float size;
  60. int wmode;
  61. int script;
  62. /* Ascender and Descender only have the conventional sense in
  63. * horizontal mode; in vertical mode they are rotated too - they are
  64. * the maximum and minimum bounds respectively. */
  65. float ascender;
  66. float descender;
  67. /* etc... */
  68. };
  69. /*
  70. fz_stext_page: A text page is a list of page blocks, together with
  71. an overall bounding box.
  72. */
  73. struct fz_stext_page_s
  74. {
  75. fz_rect mediabox;
  76. int len, cap;
  77. fz_page_block *blocks;
  78. fz_stext_page *next;
  79. };
  80. /*
  81. fz_page_block: A page block is a typed block pointer.
  82. */
  83. struct fz_page_block_s
  84. {
  85. int type;
  86. union
  87. {
  88. fz_stext_block *text;
  89. fz_image_block *image;
  90. } u;
  91. };
  /*
      Kinds of page block.
      NOTE(review): presumably the values stored in fz_page_block.type -
      confirm.
  */
  enum
  {
      /* Text block (cf. fz_stext_block). */
      FZ_PAGE_BLOCK_TEXT = 0,

      /* Image block (cf. fz_image_block). */
      FZ_PAGE_BLOCK_IMAGE = 1
  };
  97. /*
  98. fz_stext_block: A text block is a list of lines of text. In typical
  99. cases this may correspond to a paragraph or a column of text. A
  100. collection of blocks makes up a page.
  101. */
  102. struct fz_stext_block_s
  103. {
  104. fz_rect bbox;
  105. int len, cap;
  106. fz_stext_line *lines;
  107. };
  108. /*
  109. fz_image_block: An image block is an image, together with the list of lines of text. In typical
  110. cases this may correspond to a paragraph or a column of text. A
  111. collection of blocks makes up a page.
  112. */
  113. struct fz_image_block_s
  114. {
  115. fz_rect bbox;
  116. fz_matrix mat;
  117. fz_image *image;
  118. fz_colorspace *cspace;
  119. float colors[FZ_MAX_COLORS];
  120. };
  121. /*
  122. fz_stext_line: A text line is a list of text spans, with the same
  123. baseline. In typical cases this should correspond (as expected) to
  124. complete lines of text. A collection of lines makes up a block.
  125. */
  126. struct fz_stext_line_s
  127. {
  128. fz_stext_span *first_span, *last_span;
  129. /* Cached information */
  130. float distance; /* Perpendicular distance from previous line */
  131. fz_rect bbox;
  132. void *region; /* Opaque value for matching line masks */
  133. };
  134. /*
  135. fz_stext_span: A text span is a list of characters that share a common
  136. baseline/transformation. In typical cases a single span may be enough
  137. to represent a complete line. In cases where the text has big gaps in
  138. it (perhaps as it crosses columns or tables), a line may be represented
  139. by multiple spans.
  140. */
  141. struct fz_stext_span_s
  142. {
  143. int len, cap;
  144. fz_stext_char *text;
  145. fz_point min; /* Device space */
  146. fz_point max; /* Device space */
  147. int wmode; /* 0 for horizontal, 1 for vertical */
  148. fz_matrix transform; /* e and f are always 0 here */
  149. /* Ascender_max and Descender_min only have the conventional sense in
  150. * horizontal mode; in vertical mode they are rotated too - they are
  151. * the maximum and minimum bounds respectively. */
  152. float ascender_max; /* Document space */
  153. float descender_min; /* Document space */
  154. fz_rect bbox; /* Device space */
  155. /* Cached information */
  156. float base_offset; /* Perpendicular distance from baseline of line */
  157. float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */
  158. int column; /* If non zero, the column that it's in */
  159. float column_width; /* Percentage */
  160. int align; /* 0 = left, 1 = centre, 2 = right */
  161. float indent; /* The indent position for this column. */
  162. fz_stext_span *next;
  163. };
  164. /*
  165. fz_stext_char: A text char is a unicode character, the style in which
  166. is appears, and the point at which it is positioned. Transform
  167. (and hence bbox) information is given by the enclosing span.
  168. */
  169. struct fz_stext_char_s
  170. {
  171. fz_point p; /* Device space */
  172. int c;
  173. fz_stext_style *style;
  174. };
  175. typedef struct fz_char_and_box_s fz_char_and_box;
  176. struct fz_char_and_box_s
  177. {
  178. int c;
  179. fz_rect bbox;
  180. };
  181. extern const char *fz_stext_options_usage;
  182. fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx);
  183. /*
  184. fz_stext_char_bbox: Return the bbox of a text char. Calculated from
  185. the supplied enclosing span.
  186. bbox: A place to store the bbox
  187. span: The enclosing span
  188. idx: The index of the char within the span
  189. Returns bbox (updated)
  190. Does not throw exceptions
  191. */
  192. fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int idx);
  193. /*
  194. fz_new_stext_sheet: Create an empty style sheet.
  195. The style sheet is filled out by the text device, creating
  196. one style for each unique font, color, size combination that
  197. is used.
  198. */
  199. fz_stext_sheet *fz_new_stext_sheet(fz_context *ctx);
  200. void fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet);
  201. /*
  202. fz_new_stext_page: Create an empty text page.
  203. The text page is filled out by the text device to contain the blocks,
  204. lines and spans of text on the page.
  205. mediabox: optional mediabox information.
  206. */
  207. fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox);
  208. void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page);
  209. void fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page);
  210. /*
  211. fz_print_stext_sheet: Output a text sheet to a file as CSS.
  212. */
  213. void fz_print_stext_sheet(fz_context *ctx, fz_output *out, fz_stext_sheet *sheet);
  214. /*
  215. fz_print_stext_page_html: Output a page to a file in HTML format.
  216. */
  217. void fz_print_stext_page_html(fz_context *ctx, fz_output *out, fz_stext_page *page);
  218. /*
  219. fz_print_stext_page_xml: Output a page to a file in XML format.
  220. */
  221. void fz_print_stext_page_xml(fz_context *ctx, fz_output *out, fz_stext_page *page);
  222. /*
  223. fz_print_stext_page: Output a page to a file in UTF-8 format.
  224. */
  225. void fz_print_stext_page(fz_context *ctx, fz_output *out, fz_stext_page *page);
  226. /*
  227. fz_search_stext_page: Search for occurrence of 'needle' in text page.
  228. Return the number of hits and store hit bboxes in the passed in array.
  229. NOTE: This is an experimental interface and subject to change without notice.
  230. */
  231. int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_rect *hit_bbox, int hit_max);
  232. /*
  233. fz_highlight_selection: Return a list of rectangles to highlight given a selection rectangle.
  234. NOTE: This is an experimental interface and subject to change without notice.
  235. */
  236. int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max);
  237. /*
  238. fz_copy_selection: Return a newly allocated UTF-8 string with the text for a given selection rectangle.
  239. NOTE: This is an experimental interface and subject to change without notice.
  240. */
  241. char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect);
  /*
      struct fz_stext_options: Options for the structured text (stext)
      device. See fz_parse_stext_options and fz_new_stext_device.
  */
  typedef struct fz_stext_options_s fz_stext_options;

  struct fz_stext_options_s
  {
      /* NOTE(review): presumably a combination of the
       * FZ_STEXT_PRESERVE_* values - confirm. */
      int flags;
  };
  250. /*
  251. fz_parse_stext_options: Parse stext device options from a comma separated key-value string.
  252. */
  253. fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
  254. /*
  255. fz_new_stext_device: Create a device to extract the text on a page.
  256. Gather and sort the text on a page into spans of uniform style,
  257. arranged into lines and blocks by reading order. The reading order
  258. is determined by various heuristics, so may not be accurate.
  259. sheet: The text sheet to which styles should be added. This can
  260. either be a newly created (empty) text sheet, or one containing
  261. styles from a previous text device. The same sheet cannot be used
  262. in multiple threads simultaneously.
  263. page: The text page to which content should be added. This will
  264. usually be a newly created (empty) text page, but it can be one
  265. containing data already (for example when merging multiple pages,
  266. or watermarking).
  267. options: Options to configure the stext device.
  268. */
  269. fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *options);
  270. #endif