123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325 |
- #ifndef MUPDF_FITZ_STRUCTURED_TEXT_H
- #define MUPDF_FITZ_STRUCTURED_TEXT_H
- #include "mupdf/fitz/system.h"
- #include "mupdf/fitz/context.h"
- #include "mupdf/fitz/geometry.h"
- #include "mupdf/fitz/font.h"
- #include "mupdf/fitz/colorspace.h"
- #include "mupdf/fitz/image.h"
- #include "mupdf/fitz/output.h"
- #include "mupdf/fitz/device.h"
- /*
- Text extraction device: Used for searching, format conversion etc.
- (In development - Subject to change in future versions)
- */
- typedef struct fz_stext_style_s fz_stext_style; /* A distinct text style used on a page */
- typedef struct fz_stext_char_s fz_stext_char; /* A unicode character with its style and position */
- typedef struct fz_stext_span_s fz_stext_span; /* Characters sharing a common baseline/transformation */
- typedef struct fz_stext_line_s fz_stext_line; /* A list of spans with the same baseline */
- typedef struct fz_stext_block_s fz_stext_block; /* A list of lines of text */
- typedef struct fz_image_block_s fz_image_block; /* An image with its bbox, matrix and color information */
- typedef struct fz_page_block_s fz_page_block; /* A typed pointer to a text or image block */
- typedef struct fz_stext_sheet_s fz_stext_sheet; /* The list of distinct text styles in use */
- typedef struct fz_stext_page_s fz_stext_page; /* A list of page blocks plus an overall bounding box */
- /*
- Option flags for the structured text device. Each flag has its own
- bit value. NOTE(review): presumably the flags are OR-ed together into
- fz_stext_options.flags - confirm against fz_parse_stext_options.
- FZ_STEXT_PRESERVE_LIGATURES: If this option is activated ligatures
- are passed through to the application in their original form. If
- this option is deactivated ligatures are expanded into their
- constituent parts, e.g. the ligature ffi is expanded into three
- separate characters f, f and i.
- FZ_STEXT_PRESERVE_WHITESPACE: If this option is activated whitespace
- is passed through to the application in its original form. If this
- option is deactivated any type of horizontal whitespace (including
- horizontal tabs) will be replaced with space characters of variable
- width.
- */
- enum
- {
- FZ_STEXT_PRESERVE_LIGATURES = 1,
- FZ_STEXT_PRESERVE_WHITESPACE = 2,
- };
- /*
- fz_stext_sheet: A text sheet contains a list of distinct text styles
- used on a page (or a series of pages).
- */
- struct fz_stext_sheet_s
- {
- int maxid; /* NOTE(review): presumably the highest style id handed out so far - confirm */
- fz_stext_style *style; /* The list of styles; NOTE(review): presumably its head, chained through fz_stext_style.next - confirm */
- };
- /*
- fz_stext_style: A text style contains details of a distinct text style
- used on a page.
- */
- struct fz_stext_style_s
- {
- fz_stext_style *next; /* Next style in the list */
- int id; /* Identifier of this style; NOTE(review): presumably unique within its sheet - confirm */
- fz_font *font; /* Font used by this style */
- float size; /* Font size; NOTE(review): units are not visible here - confirm */
- int wmode; /* NOTE(review): presumably 0 for horizontal, 1 for vertical, as for fz_stext_span.wmode - confirm */
- int script; /* NOTE(review): meaning not visible here (possibly a super/subscript level) - confirm */
- /* Ascender and Descender only have the conventional sense in
- * horizontal mode; in vertical mode they are rotated too - they are
- * the maximum and minimum bounds respectively. */
- float ascender;
- float descender;
- /* etc... */
- };
- /*
- fz_stext_page: A text page is a list of page blocks, together with
- an overall bounding box.
- */
- struct fz_stext_page_s
- {
- fz_rect mediabox; /* Overall bounding box of the page */
- int len, cap; /* NOTE(review): presumably blocks in use / blocks allocated - confirm */
- fz_page_block *blocks; /* The list of page blocks */
- fz_stext_page *next; /* NOTE(review): presumably links to a following text page - confirm */
- };
- /*
- fz_page_block: A page block is a typed block pointer. The type field
- records which kind of block is referenced; u holds the pointer.
- */
- struct fz_page_block_s
- {
- int type; /* NOTE(review): presumably FZ_PAGE_BLOCK_TEXT or FZ_PAGE_BLOCK_IMAGE - confirm */
- union
- {
- fz_stext_block *text; /* Pointer to a text block */
- fz_image_block *image; /* Pointer to an image block */
- } u;
- };
- enum
- { /* Block type tags; NOTE(review): presumably the values stored in fz_page_block.type - confirm */
- FZ_PAGE_BLOCK_TEXT = 0, /* NOTE(review): presumably means u.text is the valid member - confirm */
- FZ_PAGE_BLOCK_IMAGE = 1 /* NOTE(review): presumably means u.image is the valid member - confirm */
- };
- /*
- fz_stext_block: A text block is a list of lines of text. In typical
- cases this may correspond to a paragraph or a column of text. A
- collection of blocks makes up a page.
- */
- struct fz_stext_block_s
- {
- fz_rect bbox; /* Bounding box of the block; NOTE(review): coordinate space not stated here - confirm */
- int len, cap; /* NOTE(review): presumably lines in use / lines allocated - confirm */
- fz_stext_line *lines; /* The lines of text in this block */
- };
- /*
- fz_image_block: An image block is an image, together with its
- bounding box, a transformation matrix, and a colorspace with an
- array of color values. A collection of blocks makes up a page.
- */
- struct fz_image_block_s
- {
- fz_rect bbox; /* Bounding box of the image block */
- fz_matrix mat; /* NOTE(review): presumably the matrix used to place the image on the page - confirm */
- fz_image *image; /* The image itself */
- fz_colorspace *cspace; /* NOTE(review): presumably the colorspace that colors[] is expressed in - confirm */
- float colors[FZ_MAX_COLORS]; /* Room for up to FZ_MAX_COLORS color components */
- };
- /*
- fz_stext_line: A text line is a list of text spans, with the same
- baseline. In typical cases this should correspond (as expected) to
- complete lines of text. A collection of lines makes up a block.
- */
- struct fz_stext_line_s
- {
- fz_stext_span *first_span, *last_span; /* First and last span of the line; NOTE(review): presumably chained through fz_stext_span.next - confirm */
- /* Cached information */
- float distance; /* Perpendicular distance from previous line */
- fz_rect bbox; /* Bounding box of the line */
- void *region; /* Opaque value for matching line masks */
- };
- /*
- fz_stext_span: A text span is a list of characters that share a common
- baseline/transformation. In typical cases a single span may be enough
- to represent a complete line. In cases where the text has big gaps in
- it (perhaps as it crosses columns or tables), a line may be represented
- by multiple spans.
- */
- struct fz_stext_span_s
- {
- int len, cap; /* NOTE(review): presumably characters in use / characters allocated - confirm */
- fz_stext_char *text; /* The characters of this span */
- fz_point min; /* Device space */
- fz_point max; /* Device space */
- int wmode; /* 0 for horizontal, 1 for vertical */
- fz_matrix transform; /* e and f are always 0 here */
- /* Ascender_max and Descender_min only have the conventional sense in
- * horizontal mode; in vertical mode they are rotated too - they are
- * the maximum and minimum bounds respectively. */
- float ascender_max; /* Document space */
- float descender_min; /* Document space */
- fz_rect bbox; /* Device space */
- /* Cached information */
- float base_offset; /* Perpendicular distance from baseline of line */
- float spacing; /* Distance along baseline from previous span in this line (or 0 if first) */
- int column; /* If non zero, the column that it's in */
- float column_width; /* Percentage */
- int align; /* 0 = left, 1 = centre, 2 = right */
- float indent; /* The indent position for this column. */
- fz_stext_span *next; /* Next span; NOTE(review): presumably the next span within the same line - confirm */
- };
- /*
- fz_stext_char: A text char is a unicode character, the style in which
- it appears, and the point at which it is positioned. Transform
- (and hence bbox) information is given by the enclosing span.
- */
- struct fz_stext_char_s
- {
- fz_point p; /* Device space */
- int c; /* The unicode character */
- fz_stext_style *style; /* The style in which the character appears */
- };
- typedef struct fz_char_and_box_s fz_char_and_box; /* A character code paired with a bounding box */
- struct fz_char_and_box_s
- {
- int c; /* NOTE(review): presumably a unicode character, as in fz_stext_char.c - confirm */
- fz_rect bbox; /* Bounding box belonging to the character */
- };
- extern const char *fz_stext_options_usage; /* Defined elsewhere. NOTE(review): presumably help text describing the options accepted by fz_parse_stext_options - confirm */
- fz_char_and_box *fz_stext_char_at(fz_context *ctx, fz_char_and_box *cab, fz_stext_page *page, int idx); /* NOTE(review): undocumented here; presumably stores the idx-th character of page and its bbox in *cab and returns cab - confirm against the implementation */
- /*
- fz_stext_char_bbox: Return the bbox of a text char. Calculated from
- the supplied enclosing span.
- ctx: The context.
- bbox: A place to store the bbox
- span: The enclosing span
- idx: The index of the char within the span
- NOTE(review): idx is presumably expected to lie within the span's
- len - confirm how out-of-range values are handled.
- Returns bbox (updated)
- Does not throw exceptions
- */
- fz_rect *fz_stext_char_bbox(fz_context *ctx, fz_rect *bbox, fz_stext_span *span, int idx);
- /*
- fz_new_stext_sheet: Create an empty style sheet.
- The style sheet is filled out by the text device, creating
- one style for each unique font, color, size combination that
- is used.
- Returns the new sheet. NOTE(review): presumably to be released
- with fz_drop_stext_sheet - confirm.
- */
- fz_stext_sheet *fz_new_stext_sheet(fz_context *ctx);
- void fz_drop_stext_sheet(fz_context *ctx, fz_stext_sheet *sheet); /* NOTE(review): presumably releases a sheet obtained from fz_new_stext_sheet together with its styles - confirm */
- /*
- fz_new_stext_page: Create an empty text page.
- The text page is filled out by the text device to contain the blocks,
- lines and spans of text on the page.
- mediabox: optional mediabox information.
- NOTE(review): "optional" presumably means NULL is accepted - confirm.
- Returns the new text page.
- */
- fz_stext_page *fz_new_stext_page(fz_context *ctx, const fz_rect *mediabox);
- void fz_drop_stext_page(fz_context *ctx, fz_stext_page *page); /* NOTE(review): presumably releases a page obtained from fz_new_stext_page together with its blocks - confirm */
- void fz_analyze_text(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page); /* NOTE(review): undocumented here; presumably post-processes the extracted page using the styles in sheet - confirm against the implementation */
- /*
- fz_print_stext_sheet: Output a text sheet as CSS.
- out: The fz_output that receives the CSS.
- sheet: The text sheet to print.
- */
- void fz_print_stext_sheet(fz_context *ctx, fz_output *out, fz_stext_sheet *sheet);
- /*
- fz_print_stext_page_html: Output a page in HTML format.
- out: The fz_output that receives the HTML.
- page: The text page to print.
- */
- void fz_print_stext_page_html(fz_context *ctx, fz_output *out, fz_stext_page *page);
- /*
- fz_print_stext_page_xml: Output a page in XML format.
- out: The fz_output that receives the XML.
- page: The text page to print.
- */
- void fz_print_stext_page_xml(fz_context *ctx, fz_output *out, fz_stext_page *page);
- /*
- fz_print_stext_page: Output a page in UTF-8 format.
- out: The fz_output that receives the UTF-8 text.
- page: The text page to print.
- */
- void fz_print_stext_page(fz_context *ctx, fz_output *out, fz_stext_page *page);
- /*
- fz_search_stext_page: Search for occurrence of 'needle' in text page.
- Return the number of hits and store hit bboxes in the passed in array.
- text: The text page to search.
- needle: The string to search for.
- hit_bbox: The array that receives the hit bboxes.
- hit_max: NOTE(review): presumably the capacity of hit_bbox, i.e. the
- maximum number of hits stored - confirm.
- NOTE: This is an experimental interface and subject to change without notice.
- */
- int fz_search_stext_page(fz_context *ctx, fz_stext_page *text, const char *needle, fz_rect *hit_bbox, int hit_max);
- /*
- fz_highlight_selection: Return a list of rectangles to highlight given a selection rectangle.
- page: The text page the selection applies to.
- rect: The selection rectangle.
- hit_bbox: The array that receives the rectangles to highlight.
- hit_max: NOTE(review): presumably the capacity of hit_bbox - confirm.
- NOTE(review): the int result is presumably the number of rectangles
- stored, as for fz_search_stext_page - confirm.
- NOTE: This is an experimental interface and subject to change without notice.
- */
- int fz_highlight_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect, fz_rect *hit_bbox, int hit_max);
- /*
- fz_copy_selection: Return a newly allocated UTF-8 string with the text for a given selection rectangle.
- page: The text page the selection applies to.
- rect: The selection rectangle.
- NOTE(review): since the string is newly allocated the caller presumably
- owns it and must free it - confirm which deallocator applies.
- NOTE: This is an experimental interface and subject to change without notice.
- */
- char *fz_copy_selection(fz_context *ctx, fz_stext_page *page, fz_rect rect);
- /*
- struct fz_stext_options: Options for creating a structured text
- device.
- */
- typedef struct fz_stext_options_s fz_stext_options;
- struct fz_stext_options_s
- {
- int flags; /* NOTE(review): presumably a combination of the FZ_STEXT_PRESERVE_* values - confirm */
- };
- /*
- fz_parse_stext_options: Parse stext device options from a comma separated key-value string.
- opts: The options structure. NOTE(review): presumably filled in from
- the string and returned as the result - confirm.
- string: The comma separated key-value option string.
- */
- fz_stext_options *fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string);
- /*
- fz_new_stext_device: Create a device to extract the text on a page.
- Gather and sort the text on a page into spans of uniform style,
- arranged into lines and blocks by reading order. The reading order
- is determined by various heuristics, so may not be accurate.
- sheet: The text sheet to which styles should be added. This can
- either be a newly created (empty) text sheet, or one containing
- styles from a previous text device. The same sheet cannot be used
- in multiple threads simultaneously.
- page: The text page to which content should be added. This will
- usually be a newly created (empty) text page, but it can be one
- containing data already (for example when merging multiple pages,
- or watermarking).
- options: Options to configure the stext device.
- NOTE(review): whether options may be NULL to request defaults is not
- stated here - confirm.
- Returns the newly created device.
- */
- fz_device *fz_new_stext_device(fz_context *ctx, fz_stext_sheet *sheet, fz_stext_page *page, const fz_stext_options *options);
- #endif
|