/* ucdn.h */


/*
 * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net>
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#ifndef UCDN_H
#define UCDN_H
#include <stdint.h> /* uint32_t, used by the prototypes below */
#ifdef __cplusplus
extern "C" {
#endif
  21. #define UCDN_EAST_ASIAN_F 0
  22. #define UCDN_EAST_ASIAN_H 1
  23. #define UCDN_EAST_ASIAN_W 2
  24. #define UCDN_EAST_ASIAN_NA 3
  25. #define UCDN_EAST_ASIAN_A 4
  26. #define UCDN_EAST_ASIAN_N 5
  27. #define UCDN_SCRIPT_COMMON 0
  28. #define UCDN_SCRIPT_LATIN 1
  29. #define UCDN_SCRIPT_GREEK 2
  30. #define UCDN_SCRIPT_CYRILLIC 3
  31. #define UCDN_SCRIPT_ARMENIAN 4
  32. #define UCDN_SCRIPT_HEBREW 5
  33. #define UCDN_SCRIPT_ARABIC 6
  34. #define UCDN_SCRIPT_SYRIAC 7
  35. #define UCDN_SCRIPT_THAANA 8
  36. #define UCDN_SCRIPT_DEVANAGARI 9
  37. #define UCDN_SCRIPT_BENGALI 10
  38. #define UCDN_SCRIPT_GURMUKHI 11
  39. #define UCDN_SCRIPT_GUJARATI 12
  40. #define UCDN_SCRIPT_ORIYA 13
  41. #define UCDN_SCRIPT_TAMIL 14
  42. #define UCDN_SCRIPT_TELUGU 15
  43. #define UCDN_SCRIPT_KANNADA 16
  44. #define UCDN_SCRIPT_MALAYALAM 17
  45. #define UCDN_SCRIPT_SINHALA 18
  46. #define UCDN_SCRIPT_THAI 19
  47. #define UCDN_SCRIPT_LAO 20
  48. #define UCDN_SCRIPT_TIBETAN 21
  49. #define UCDN_SCRIPT_MYANMAR 22
  50. #define UCDN_SCRIPT_GEORGIAN 23
  51. #define UCDN_SCRIPT_HANGUL 24
  52. #define UCDN_SCRIPT_ETHIOPIC 25
  53. #define UCDN_SCRIPT_CHEROKEE 26
  54. #define UCDN_SCRIPT_CANADIAN_ABORIGINAL 27
  55. #define UCDN_SCRIPT_OGHAM 28
  56. #define UCDN_SCRIPT_RUNIC 29
  57. #define UCDN_SCRIPT_KHMER 30
  58. #define UCDN_SCRIPT_MONGOLIAN 31
  59. #define UCDN_SCRIPT_HIRAGANA 32
  60. #define UCDN_SCRIPT_KATAKANA 33
  61. #define UCDN_SCRIPT_BOPOMOFO 34
  62. #define UCDN_SCRIPT_HAN 35
  63. #define UCDN_SCRIPT_YI 36
  64. #define UCDN_SCRIPT_OLD_ITALIC 37
  65. #define UCDN_SCRIPT_GOTHIC 38
  66. #define UCDN_SCRIPT_DESERET 39
  67. #define UCDN_SCRIPT_INHERITED 40
  68. #define UCDN_SCRIPT_TAGALOG 41
  69. #define UCDN_SCRIPT_HANUNOO 42
  70. #define UCDN_SCRIPT_BUHID 43
  71. #define UCDN_SCRIPT_TAGBANWA 44
  72. #define UCDN_SCRIPT_LIMBU 45
  73. #define UCDN_SCRIPT_TAI_LE 46
  74. #define UCDN_SCRIPT_LINEAR_B 47
  75. #define UCDN_SCRIPT_UGARITIC 48
  76. #define UCDN_SCRIPT_SHAVIAN 49
  77. #define UCDN_SCRIPT_OSMANYA 50
  78. #define UCDN_SCRIPT_CYPRIOT 51
  79. #define UCDN_SCRIPT_BRAILLE 52
  80. #define UCDN_SCRIPT_BUGINESE 53
  81. #define UCDN_SCRIPT_COPTIC 54
  82. #define UCDN_SCRIPT_NEW_TAI_LUE 55
  83. #define UCDN_SCRIPT_GLAGOLITIC 56
  84. #define UCDN_SCRIPT_TIFINAGH 57
  85. #define UCDN_SCRIPT_SYLOTI_NAGRI 58
  86. #define UCDN_SCRIPT_OLD_PERSIAN 59
  87. #define UCDN_SCRIPT_KHAROSHTHI 60
  88. #define UCDN_SCRIPT_BALINESE 61
  89. #define UCDN_SCRIPT_CUNEIFORM 62
  90. #define UCDN_SCRIPT_PHOENICIAN 63
  91. #define UCDN_SCRIPT_PHAGS_PA 64
  92. #define UCDN_SCRIPT_NKO 65
  93. #define UCDN_SCRIPT_SUNDANESE 66
  94. #define UCDN_SCRIPT_LEPCHA 67
  95. #define UCDN_SCRIPT_OL_CHIKI 68
  96. #define UCDN_SCRIPT_VAI 69
  97. #define UCDN_SCRIPT_SAURASHTRA 70
  98. #define UCDN_SCRIPT_KAYAH_LI 71
  99. #define UCDN_SCRIPT_REJANG 72
  100. #define UCDN_SCRIPT_LYCIAN 73
  101. #define UCDN_SCRIPT_CARIAN 74
  102. #define UCDN_SCRIPT_LYDIAN 75
  103. #define UCDN_SCRIPT_CHAM 76
  104. #define UCDN_SCRIPT_TAI_THAM 77
  105. #define UCDN_SCRIPT_TAI_VIET 78
  106. #define UCDN_SCRIPT_AVESTAN 79
  107. #define UCDN_SCRIPT_EGYPTIAN_HIEROGLYPHS 80
  108. #define UCDN_SCRIPT_SAMARITAN 81
  109. #define UCDN_SCRIPT_LISU 82
  110. #define UCDN_SCRIPT_BAMUM 83
  111. #define UCDN_SCRIPT_JAVANESE 84
  112. #define UCDN_SCRIPT_MEETEI_MAYEK 85
  113. #define UCDN_SCRIPT_IMPERIAL_ARAMAIC 86
  114. #define UCDN_SCRIPT_OLD_SOUTH_ARABIAN 87
  115. #define UCDN_SCRIPT_INSCRIPTIONAL_PARTHIAN 88
  116. #define UCDN_SCRIPT_INSCRIPTIONAL_PAHLAVI 89
  117. #define UCDN_SCRIPT_OLD_TURKIC 90
  118. #define UCDN_SCRIPT_KAITHI 91
  119. #define UCDN_SCRIPT_BATAK 92
  120. #define UCDN_SCRIPT_BRAHMI 93
  121. #define UCDN_SCRIPT_MANDAIC 94
  122. #define UCDN_SCRIPT_CHAKMA 95
  123. #define UCDN_SCRIPT_MEROITIC_CURSIVE 96
  124. #define UCDN_SCRIPT_MEROITIC_HIEROGLYPHS 97
  125. #define UCDN_SCRIPT_MIAO 98
  126. #define UCDN_SCRIPT_SHARADA 99
  127. #define UCDN_SCRIPT_SORA_SOMPENG 100
  128. #define UCDN_SCRIPT_TAKRI 101
  129. #define UCDN_SCRIPT_UNKNOWN 102
  130. #define UCDN_SCRIPT_BASSA_VAH 103
  131. #define UCDN_SCRIPT_CAUCASIAN_ALBANIAN 104
  132. #define UCDN_SCRIPT_DUPLOYAN 105
  133. #define UCDN_SCRIPT_ELBASAN 106
  134. #define UCDN_SCRIPT_GRANTHA 107
  135. #define UCDN_SCRIPT_KHOJKI 108
  136. #define UCDN_SCRIPT_KHUDAWADI 109
  137. #define UCDN_SCRIPT_LINEAR_A 110
  138. #define UCDN_SCRIPT_MAHAJANI 111
  139. #define UCDN_SCRIPT_MANICHAEAN 112
  140. #define UCDN_SCRIPT_MENDE_KIKAKUI 113
  141. #define UCDN_SCRIPT_MODI 114
  142. #define UCDN_SCRIPT_MRO 115
  143. #define UCDN_SCRIPT_NABATAEAN 116
  144. #define UCDN_SCRIPT_OLD_NORTH_ARABIAN 117
  145. #define UCDN_SCRIPT_OLD_PERMIC 118
  146. #define UCDN_SCRIPT_PAHAWH_HMONG 119
  147. #define UCDN_SCRIPT_PALMYRENE 120
  148. #define UCDN_SCRIPT_PAU_CIN_HAU 121
  149. #define UCDN_SCRIPT_PSALTER_PAHLAVI 122
  150. #define UCDN_SCRIPT_SIDDHAM 123
  151. #define UCDN_SCRIPT_TIRHUTA 124
  152. #define UCDN_SCRIPT_WARANG_CITI 125
  153. #define UCDN_SCRIPT_AHOM 126
  154. #define UCDN_SCRIPT_ANATOLIAN_HIEROGLYPHS 127
  155. #define UCDN_SCRIPT_HATRAN 128
  156. #define UCDN_SCRIPT_MULTANI 129
  157. #define UCDN_SCRIPT_OLD_HUNGARIAN 130
  158. #define UCDN_SCRIPT_SIGNWRITING 131
  159. #define UCDN_SCRIPT_ADLAM 132
  160. #define UCDN_SCRIPT_BHAIKSUKI 133
  161. #define UCDN_SCRIPT_MARCHEN 134
  162. #define UCDN_SCRIPT_NEWA 135
  163. #define UCDN_SCRIPT_OSAGE 136
  164. #define UCDN_SCRIPT_TANGUT 137
  165. #define UCDN_LAST_SCRIPT 137
  166. #define UCDN_LINEBREAK_CLASS_OP 0
  167. #define UCDN_LINEBREAK_CLASS_CL 1
  168. #define UCDN_LINEBREAK_CLASS_CP 2
  169. #define UCDN_LINEBREAK_CLASS_QU 3
  170. #define UCDN_LINEBREAK_CLASS_GL 4
  171. #define UCDN_LINEBREAK_CLASS_NS 5
  172. #define UCDN_LINEBREAK_CLASS_EX 6
  173. #define UCDN_LINEBREAK_CLASS_SY 7
  174. #define UCDN_LINEBREAK_CLASS_IS 8
  175. #define UCDN_LINEBREAK_CLASS_PR 9
  176. #define UCDN_LINEBREAK_CLASS_PO 10
  177. #define UCDN_LINEBREAK_CLASS_NU 11
  178. #define UCDN_LINEBREAK_CLASS_AL 12
  179. #define UCDN_LINEBREAK_CLASS_HL 13
  180. #define UCDN_LINEBREAK_CLASS_ID 14
  181. #define UCDN_LINEBREAK_CLASS_IN 15
  182. #define UCDN_LINEBREAK_CLASS_HY 16
  183. #define UCDN_LINEBREAK_CLASS_BA 17
  184. #define UCDN_LINEBREAK_CLASS_BB 18
  185. #define UCDN_LINEBREAK_CLASS_B2 19
  186. #define UCDN_LINEBREAK_CLASS_ZW 20
  187. #define UCDN_LINEBREAK_CLASS_CM 21
  188. #define UCDN_LINEBREAK_CLASS_WJ 22
  189. #define UCDN_LINEBREAK_CLASS_H2 23
  190. #define UCDN_LINEBREAK_CLASS_H3 24
  191. #define UCDN_LINEBREAK_CLASS_JL 25
  192. #define UCDN_LINEBREAK_CLASS_JV 26
  193. #define UCDN_LINEBREAK_CLASS_JT 27
  194. #define UCDN_LINEBREAK_CLASS_RI 28
  195. #define UCDN_LINEBREAK_CLASS_AI 29
  196. #define UCDN_LINEBREAK_CLASS_BK 30
  197. #define UCDN_LINEBREAK_CLASS_CB 31
  198. #define UCDN_LINEBREAK_CLASS_CJ 32
  199. #define UCDN_LINEBREAK_CLASS_CR 33
  200. #define UCDN_LINEBREAK_CLASS_LF 34
  201. #define UCDN_LINEBREAK_CLASS_NL 35
  202. #define UCDN_LINEBREAK_CLASS_SA 36
  203. #define UCDN_LINEBREAK_CLASS_SG 37
  204. #define UCDN_LINEBREAK_CLASS_SP 38
  205. #define UCDN_LINEBREAK_CLASS_XX 39
  206. #define UCDN_GENERAL_CATEGORY_CC 0
  207. #define UCDN_GENERAL_CATEGORY_CF 1
  208. #define UCDN_GENERAL_CATEGORY_CN 2
  209. #define UCDN_GENERAL_CATEGORY_CO 3
  210. #define UCDN_GENERAL_CATEGORY_CS 4
  211. #define UCDN_GENERAL_CATEGORY_LL 5
  212. #define UCDN_GENERAL_CATEGORY_LM 6
  213. #define UCDN_GENERAL_CATEGORY_LO 7
  214. #define UCDN_GENERAL_CATEGORY_LT 8
  215. #define UCDN_GENERAL_CATEGORY_LU 9
  216. #define UCDN_GENERAL_CATEGORY_MC 10
  217. #define UCDN_GENERAL_CATEGORY_ME 11
  218. #define UCDN_GENERAL_CATEGORY_MN 12
  219. #define UCDN_GENERAL_CATEGORY_ND 13
  220. #define UCDN_GENERAL_CATEGORY_NL 14
  221. #define UCDN_GENERAL_CATEGORY_NO 15
  222. #define UCDN_GENERAL_CATEGORY_PC 16
  223. #define UCDN_GENERAL_CATEGORY_PD 17
  224. #define UCDN_GENERAL_CATEGORY_PE 18
  225. #define UCDN_GENERAL_CATEGORY_PF 19
  226. #define UCDN_GENERAL_CATEGORY_PI 20
  227. #define UCDN_GENERAL_CATEGORY_PO 21
  228. #define UCDN_GENERAL_CATEGORY_PS 22
  229. #define UCDN_GENERAL_CATEGORY_SC 23
  230. #define UCDN_GENERAL_CATEGORY_SK 24
  231. #define UCDN_GENERAL_CATEGORY_SM 25
  232. #define UCDN_GENERAL_CATEGORY_SO 26
  233. #define UCDN_GENERAL_CATEGORY_ZL 27
  234. #define UCDN_GENERAL_CATEGORY_ZP 28
  235. #define UCDN_GENERAL_CATEGORY_ZS 29
  236. #define UCDN_BIDI_CLASS_L 0
  237. #define UCDN_BIDI_CLASS_LRE 1
  238. #define UCDN_BIDI_CLASS_LRO 2
  239. #define UCDN_BIDI_CLASS_R 3
  240. #define UCDN_BIDI_CLASS_AL 4
  241. #define UCDN_BIDI_CLASS_RLE 5
  242. #define UCDN_BIDI_CLASS_RLO 6
  243. #define UCDN_BIDI_CLASS_PDF 7
  244. #define UCDN_BIDI_CLASS_EN 8
  245. #define UCDN_BIDI_CLASS_ES 9
  246. #define UCDN_BIDI_CLASS_ET 10
  247. #define UCDN_BIDI_CLASS_AN 11
  248. #define UCDN_BIDI_CLASS_CS 12
  249. #define UCDN_BIDI_CLASS_NSM 13
  250. #define UCDN_BIDI_CLASS_BN 14
  251. #define UCDN_BIDI_CLASS_B 15
  252. #define UCDN_BIDI_CLASS_S 16
  253. #define UCDN_BIDI_CLASS_WS 17
  254. #define UCDN_BIDI_CLASS_ON 18
  255. #define UCDN_BIDI_CLASS_LRI 19
  256. #define UCDN_BIDI_CLASS_RLI 20
  257. #define UCDN_BIDI_CLASS_FSI 21
  258. #define UCDN_BIDI_CLASS_PDI 22
  259. #define UCDN_BIDI_PAIRED_BRACKET_TYPE_OPEN 0
  260. #define UCDN_BIDI_PAIRED_BRACKET_TYPE_CLOSE 1
  261. #define UCDN_BIDI_PAIRED_BRACKET_TYPE_NONE 2
  262. /**
  263. * Return version of the Unicode database.
  264. *
  265. * @return Unicode database version
  266. */
  267. const char *ucdn_get_unicode_version(void);
  268. /**
  269. * Get combining class of a codepoint.
  270. *
  271. * @param code Unicode codepoint
  272. * @return combining class value, as defined in UAX#44
  273. */
  274. int ucdn_get_combining_class(uint32_t code);
  275. /**
  276. * Get east-asian width of a codepoint.
  277. *
  278. * @param code Unicode codepoint
  279. * @return value according to UCDN_EAST_ASIAN_* and as defined in UAX#11.
  280. */
  281. int ucdn_get_east_asian_width(uint32_t code);
  282. /**
  283. * Get general category of a codepoint.
  284. *
  285. * @param code Unicode codepoint
  286. * @return value according to UCDN_GENERAL_CATEGORY_* and as defined in
  287. * UAX#44.
  288. */
  289. int ucdn_get_general_category(uint32_t code);
  290. /**
  291. * Get bidirectional class of a codepoint.
  292. *
  293. * @param code Unicode codepoint
  294. * @return value according to UCDN_BIDI_CLASS_* and as defined in UAX#44.
  295. */
  296. int ucdn_get_bidi_class(uint32_t code);
  297. /**
  298. * Get script of a codepoint.
  299. *
  300. * @param code Unicode codepoint
  301. * @return value according to UCDN_SCRIPT_* and as defined in UAX#24.
  302. */
  303. int ucdn_get_script(uint32_t code);
  304. /**
  305. * Get unresolved linebreak class of a codepoint. This does not take
  306. * rule LB1 of UAX#14 into account. See ucdn_get_resolved_linebreak_class()
  307. * for resolved linebreak classes.
  308. *
  309. * @param code Unicode codepoint
  310. * @return value according to UCDN_LINEBREAK_* and as defined in UAX#14.
  311. */
  312. int ucdn_get_linebreak_class(uint32_t code);
  313. /**
  314. * Get resolved linebreak class of a codepoint. This resolves characters
  315. * in the AI, SG, XX, SA and CJ classes according to rule LB1 of UAX#14.
  316. * In addition the CB class is resolved as the equivalent B2 class and
  317. * the NL class is resolved as the equivalent BK class.
  318. *
  319. * @param code Unicode codepoint
  320. * @return value according to UCDN_LINEBREAK_* and as defined in UAX#14.
  321. */
  322. int ucdn_get_resolved_linebreak_class(uint32_t code);
  323. /**
  324. * Check if codepoint can be mirrored.
  325. *
  326. * @param code Unicode codepoint
  327. * @return 1 if mirrored character exists, otherwise 0
  328. */
  329. int ucdn_get_mirrored(uint32_t code);
  330. /**
  331. * Mirror a codepoint.
  332. *
  333. * @param code Unicode codepoint
  334. * @return mirrored codepoint or the original codepoint if no
  335. * mirrored character exists
  336. */
  337. uint32_t ucdn_mirror(uint32_t code);
  338. /**
  339. * Get paired bracket for a codepoint.
  340. *
  341. * @param code Unicode codepoint
  342. * @return paired bracket codepoint or the original codepoint if no
  343. * paired bracket character exists
  344. */
  345. uint32_t ucdn_paired_bracket(uint32_t code);
  346. /**
  347. * Get paired bracket type for a codepoint.
  348. *
  349. * @param code Unicode codepoint
  350. * @return value according to UCDN_BIDI_PAIRED_BRACKET_TYPE_* and as defined
  351. * in UAX#9.
  352. *
  353. */
  354. int ucdn_paired_bracket_type(uint32_t code);
  355. /**
  356. * Pairwise canonical decomposition of a codepoint. This includes
  357. * Hangul Jamo decomposition (see chapter 3.12 of the Unicode core
  358. * specification).
  359. *
  360. * Hangul is decomposed into L and V jamos for LV forms, and an
  361. * LV precomposed syllable and a T jamo for LVT forms.
  362. *
  363. * @param code Unicode codepoint
  364. * @param a filled with first codepoint of decomposition
  365. * @param b filled with second codepoint of decomposition, or 0
  366. * @return success
  367. */
  368. int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b);
  369. /**
  370. * Compatibility decomposition of a codepoint.
  371. *
  372. * @param code Unicode codepoint
  373. * @param decomposed filled with decomposition, must be able to hold 18
  374. * characters
  375. * @return length of decomposition or 0 in case none exists
  376. */
  377. int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed);
  378. /**
  379. * Pairwise canonical composition of two codepoints. This includes
  380. * Hangul Jamo composition (see chapter 3.12 of the Unicode core
  381. * specification).
  382. *
  383. * Hangul composition expects either L and V jamos, or an LV
  384. * precomposed syllable and a T jamo. This is exactly the inverse
  385. * of pairwise Hangul decomposition.
  386. *
  387. * @param code filled with composition
  388. * @param a first codepoint
  389. * @param b second codepoint
  390. * @return success
  391. */
  392. int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b);
  393. #ifdef __cplusplus
  394. }
  395. #endif
  396. #endif