u5e
Unicode Text C++ Library
normalization_form_c.hpp
1 #ifndef INCLUDED_U5E_NORMALIZATION_FORM_C
2 #define INCLUDED_U5E_NORMALIZATION_FORM_C
3 
4 #include <algorithm>
5 #include <u5e/basic_grapheme.hpp>
6 #include <u5e/utf32ne_string.hpp>
7 #include <u5e/filter.hpp>
8 #include <u5e/canonical_decomposition.hpp>
9 #include <u5e/canonical_combining_order.hpp>
10 #include <u5e/canonical_composition.hpp>
11 
12 namespace u5e {
13  /**
14  * \brief u5e::filter algorithm for normalizing graphemes
15  *
16  * This will work by reading an input grapheme iterator and,
17  * grapheme by grapheme normalize them in form C.
18  *
19  * This will use the unicode database to search for equivalent
20  * codepoint sequences.
21  */
22  template <typename InputStorageType,
23  typename OutputStorageType = InputStorageType>
24  inline int normalization_form_c(basic_grapheme<InputStorageType> grapheme,
25  OutputStorageType& output) {
26 
27  // first step is to decompose the grapheme
28  utf32ne_string decomposed;
29  int count = u5e::filter(grapheme.codepoint_begin(),
30  grapheme.codepoint_end(),
31  decomposed,
32  canonical_decomposition<utf32ne_string>);
33 
34  // then sort based on canonical combining class
35  std::sort(decomposed.codepoint_begin(), decomposed.codepoint_end(),
37 
38  // finally recompose. we will do that in-place on the decomposed
39  // string, since we never have to look back.
40  int compositions = 0;
41  utf32ne_string::iterator oi_begin(decomposed.codepoint_begin());
42  utf32ne_string::iterator oi
43  (u5e::canonical_composition(decomposed,&compositions));
44 
45  // finally append the output
46  output.template append_from_utf32ne<utf32ne_string>
47  (oi_begin, oi);
48 
49  // we re-use the counter from the decomposition filter and
50  // subtract how many pair were composed into a single codepoint.
51  return count - compositions;
52  }
53 }
54 
55 #endif
main u5e namespace
int normalization_form_c(basic_grapheme< InputStorageType > grapheme, OutputStorageType &output)
u5e::filter algorithm for normalizing graphemes
basic_encodedstring< utf32ne, std::basic_string< int > > utf32ne_string
A basic_encodedstring of utf32ne and std::basic_string<int>
StorageType::iterator canonical_composition(StorageType &data, int *count)
performs in-place canonical composition.
bool canonical_combining_order(int a, int b)
compare codepoints according to the canonical combining order
Represents a single grapheme cluster.