u5e
Unicode Text C++ Library
normalization_form_kd.hpp
1 #ifndef INCLUDED_U5E_NORMALIZATION_FORM_KD
2 #define INCLUDED_U5E_NORMALIZATION_FORM_KD
3 
4 #include <algorithm>
5 #include <u5e/basic_grapheme.hpp>
6 #include <u5e/utf32ne_string.hpp>
7 #include <u5e/filter.hpp>
8 #include <u5e/compatibility_and_canonical_decomposition.hpp>
9 #include <u5e/canonical_combining_order.hpp>
10 
11 namespace u5e {
12  /**
13  * \brief u5e::filter algorithm for normalizing graphemes
14  *
15  * This will work by reading an input grapheme iterator and,
16  * grapheme by grapheme normalize them in form KD.
17  *
18  * This will use the unicode database to search for equivalent
19  * codepoint sequences.
20  */
21  template <typename InputStorageType,
22  typename OutputStorageType = InputStorageType>
23  inline int normalization_form_kd(basic_grapheme<InputStorageType> grapheme,
24  OutputStorageType& output) {
25 
26  // first step is to decompose the grapheme
27  utf32ne_string decomposed;
28  int count = u5e::filter
29  (grapheme.codepoint_begin(),
30  grapheme.codepoint_end(),
31  decomposed,
32  compatibility_and_canonical_decomposition<utf32ne_string>);
33 
34  // then sort based on canonical combining class
35  std::sort(decomposed.codepoint_begin(), decomposed.codepoint_end(),
37 
38  // finally append the output
39  output.template append_from_utf32ne<utf32ne_string>
40  (decomposed.codepoint_begin(),
41  decomposed.codepoint_end());
42 
43  // we re-use the counter from the decomposition filter.
44  return count;
45  }
46 }
47 
48 #endif
main u5e namespace
basic_encodedstring< utf32ne, std::basic_string< int > > utf32ne_string
A basic_encodedstring of utf32ne and std::basic_string<int>
bool canonical_combining_order(int a, int b)
compare codepoints according to the canonical combining order
int normalization_form_kd(basic_grapheme< InputStorageType > grapheme, OutputStorageType &output)
u5e::filter algorithm for normalizing graphemes
Represents a single grapheme cluster.