u5e
Unicode Text C++ Library
normalization_form_kc.hpp
1 #ifndef INCLUDED_U5E_NORMALIZATION_FORM_KC
2 #define INCLUDED_U5E_NORMALIZATION_FORM_KC
3 
4 #include <algorithm>
5 #include <u5e/basic_grapheme.hpp>
6 #include <u5e/utf32ne_string.hpp>
7 #include <u5e/filter.hpp>
8 #include <u5e/compatibility_and_canonical_decomposition.hpp>
9 #include <u5e/canonical_combining_order.hpp>
10 #include <u5e/canonical_composition.hpp>
11 
12 namespace u5e {
13  /**
14  * \brief u5e::filter algorithm for normalizing graphemes
15  *
16  * This will work by reading an input grapheme iterator and,
17  * grapheme by grapheme normalize them in form KC.
18  *
19  * This will use the unicode database to search for equivalent
20  * codepoint sequences.
21  */
22  template <typename InputStorageType,
23  typename OutputStorageType = InputStorageType>
24  inline int normalization_form_kc(basic_grapheme<InputStorageType> grapheme,
25  OutputStorageType& output) {
26 
27  // first step is to decompose the grapheme
28  utf32ne_string decomposed;
29  int count = u5e::filter
30  (grapheme.codepoint_begin(),
31  grapheme.codepoint_end(),
32  decomposed,
33  compatibility_and_canonical_decomposition<utf32ne_string>);
34 
35  // then sort based on canonical combining class
36  std::sort(decomposed.codepoint_begin(), decomposed.codepoint_end(),
38 
39  // finally recompose. we will do that in-place on the decomposed
40  // string, since we never have to look back.
41  int compositions = 0;
42  utf32ne_string::iterator oi_begin(decomposed.codepoint_begin());
43  utf32ne_string::iterator oi
44  (u5e::canonical_composition(decomposed,&compositions));
45 
46  // finally append the output
47  output.template append_from_utf32ne<utf32ne_string>
48  (oi_begin, oi);
49 
50  // we re-use the counter from the decomposition filter and
51  // subtract how many pair were composed into a single codepoint.
52  return count - compositions;
53  }
54 }
55 
56 #endif
main u5e namespace
basic_encodedstring< utf32ne, std::basic_string< int > > utf32ne_string
A basic_encodedstring of utf32ne and std::basic_string<int>
int normalization_form_kc(basic_grapheme< InputStorageType > grapheme, OutputStorageType &output)
u5e::filter algorithm for normalizing graphemes
StorageType::iterator canonical_composition(StorageType &data, int *count)
performs in-place canonical composition.
bool canonical_combining_order(int a, int b)
compare codepoints according to the canonical combining order
Represents a single grapheme cluster.