u5e
UnicodeTextC++Library
basic_grapheme_iterator.hpp
1 #ifndef INCLUDED_U5E_BASIC_GRAPHEME_ITERATOR
2 #define INCLUDED_U5E_BASIC_GRAPHEME_ITERATOR
3 
4 #include <u5e/basic_grapheme.hpp>
5 #include <u5e/props/grapheme_cluster_break.hpp>
6 
7 namespace u5e {
8 
9  /**
10  * \brief Iterator that describes full graphemes.
11  *
12  * \tparam UnderlyingEncodedStringView the underlying encoded string
13  * type with an underlying native string-like type.
14  */
15  template <typename UnderlyingEncodedStringView>
16  class basic_grapheme_iterator {
17  public:
18  /**
19  * The type of the underlying encoded iterator
20  */
21  typedef typename UnderlyingEncodedStringView::const_iterator
23 
24  /**
25  * the specific grapheme type for this encoded string view
26  */
27  typedef basic_grapheme<UnderlyingEncodedStringView> grapheme;
28  typedef grapheme value_type;
29 
30  //@{
31  /**
32  * The begin and end iterators for the whole text are necessary for
33  * bounds check, since the size of graphemes cannot be predicted.
34  */
37  //@}
38  //@{
39  /**
40  * This pair of iterators points to where we are now and where the end
41  * of the current grapheme is.
42  */
45  //@}
46 
47  typedef props::grapheme_cluster_break::prop_value_type g_c_b_vt;
48 
49  /**
50  * The unicode standard documents that a grapheme boundary can be
51  * determined by looking just at two adjacent codepoints.
52  */
56 
57  if (va == g_c_b_vt::CR &&
58  vb == g_c_b_vt::LF) {
59  // GB3
60  return false;
61  } else if (va == g_c_b_vt::CR ||
62  va == g_c_b_vt::LF ||
63  va == g_c_b_vt::CONTROL) {
64  // GB4
65  return true;
66  } else if (vb == g_c_b_vt::CR ||
67  vb == g_c_b_vt::LF ||
68  vb == g_c_b_vt::CONTROL) {
69  // GB5
70  return true;
71  } else if (va == g_c_b_vt::L &&
72  (vb == g_c_b_vt::L ||
73  vb == g_c_b_vt::V ||
74  vb == g_c_b_vt::LV ||
75  vb == g_c_b_vt::LVT)) {
76  // GB6
77  return false;
78  } else if ((va == g_c_b_vt::LV ||
79  va == g_c_b_vt::V) &&
80  (vb == g_c_b_vt::V ||
81  vb == g_c_b_vt::T)) {
82  // GB7
83  return false;
84  } else if ((va == g_c_b_vt::LVT ||
85  va == g_c_b_vt::T) &&
86  vb == g_c_b_vt::T) {
87  // GB8
88  return false;
89  } else if (vb == g_c_b_vt::EXTEND ||
90  vb == g_c_b_vt::ZWJ) {
91  // GB9
92  return false;
93  } else if (vb == g_c_b_vt::SPACINGMARK) {
94  // GB9a
95  return false;
96  } else if (va == g_c_b_vt::PREPEND) {
97  // GB9b
98  return false;
99  } else if ( ( (va == g_c_b_vt::E_BASE ||
100  va == g_c_b_vt::E_BASE_GAZ) &&
101  vb == g_c_b_vt::E_MODIFIER) ||
102  ( va == g_c_b_vt::EXTEND &&
103  vb == g_c_b_vt::E_MODIFIER )) {
104  // GB10 -- that is the interpretation I can make
105  // of the combination of the fact that you should be able
106  // to compare only two adjacent characters and the text of
107  // the standard.
108  return false;
109  } else if (va == g_c_b_vt::ZWJ &&
110  (vb == g_c_b_vt::GLUE_AFTER_ZWJ ||
111  vb == g_c_b_vt::E_BASE_GAZ)) {
112  // GB11
113  return false;
114  } else if (va == g_c_b_vt::REGIONAL_INDICATOR &&
115  vb == g_c_b_vt::REGIONAL_INDICATOR) {
116  // GB12, GB13
117  // again, I take the liberty to assume the earlier part of the text
118  // that says you only need to look at two adjacent characters
119  return false;
120  } else {
121  // GB999
122  return true;
123  }
124  }
125 
126  //@{
127  /**
128  * Use the data from the unicode database to find the start and
129  * end of the current grapheme.
130  */
132  // GB2
133  if (end_of_grapheme_ == end_)
134  return;
135  // advance end_of_grapheme_ until it's no longer in the same grapheme
136 
137  // GB1
138  // this always starts as where_ == end_of_grapheme_;
141 
142  while (1) {
143  // GB2
144  if (end_of_grapheme_ == end_)
145  return;
147 
148  if (is_grapheme_boundary(a, b)) {
149  return;
150  }
151 
152  a = b;
154  }
155  }
156 
158  // GB2
159  if (where_ == begin_)
160  return;
161  // rewind where_ until it's no longer in the same grapheme
162 
163  // GB1
164  // this always starts as copy = where_
166  --copy;
167  codepoint a = *copy;
168 
169  while (1) {
170  if (where_ == begin_)
171  return;
172  codepoint b = *where_;
173 
174  if (is_grapheme_boundary(a, b)) {
175  return;
176  }
177 
178  a = b;
179  --where_;
180  }
181  }
182  //@}
183 
184  /**
185  * \brief start at the beginning of the text
186  */
189  :begin_(b), end_(e), where_(b), end_of_grapheme_(b) {
191  };
192 
193  /**
194  * \brief start at a specific point
195  * find the start and the end of the grapheme
196  */
200  :begin_(b), end_(e), where_(w), end_of_grapheme_(w) {
203  };
204 
205  /**
206  * \brief start at a specific point - precalculated
207  * start and end of grapheme
208  */
213  :begin_(b), end_(e), where_(w), end_of_grapheme_(we) {
214  };
215 
216  /**
217  * \brief copy constructor
218  */
219  basic_grapheme_iterator(const basic_grapheme_iterator& copy)
220  :begin_(copy.begin_), end_(copy.end_),
221  where_(copy.where_), end_of_grapheme_(copy.end_of_grapheme_) {}
222 
223  /**
224  * dereference to a grapheme object
225  */
228  }
229 
230  //@{
231  /**
232  * advance one grapheme
233  */
234  basic_grapheme_iterator operator++() {
237  return *this;
238  }
239 
240  basic_grapheme_iterator operator++(int i) {
241  basic_grapheme_iterator copy(*this);
242  ++(*this);
243  return copy;
244  }
245  //@}
246 
247  /**
248  * delegate the comparison to the underlying iterator
249  */
251  if (where_ == other) {
252  return true;
253  } else {
254  for (const_codepoint_iterator copy = where_;
255  copy != end_of_grapheme_; copy++) {
256  if (copy == other) {
257  return true;
258  }
259  }
260  return false;
261  }
262  }
263 
264  /**
265  * delegate the comparison to the underlying iterator
266  */
267  bool operator==(basic_grapheme_iterator other) {
268  if (where_ == end_ &&
269  other == end_) {
270  return true;
271  } else {
272  for (const_codepoint_iterator copy = where_;
273  copy != end_of_grapheme_; copy++) {
274  if (other == copy) {
275  return true;
276  }
277  }
278  return false;
279  }
280  }
281 
282  /**
283  * delegate the comparison to the underlying iterator
284  */
285  bool operator!=(basic_grapheme_iterator other) {
286  return !(*this == other);
287  }
288 
289  /**
290  * delegate the comparison to the underlying iterator
291  */
293  {
294  return !(*this == other);
295  }
296  };
297 };
298 
299 #endif
Grapheme Cluster Break property for a codepoint.
basic_grapheme_iterator(const_codepoint_iterator b, const_codepoint_iterator e, const_codepoint_iterator w)
start at a specific point find the start and the end of the grapheme
bool is_grapheme_boundary(codepoint a, codepoint b)
bool operator!=(const_codepoint_iterator other)
const_codepoint_iterator end_of_grapheme_
basic_grapheme_iterator(const_codepoint_iterator b, const_codepoint_iterator e)
start at the beginning of the text
main u5e namespace
Native representation of a codepoint.
Definition: codepoint.hpp:15
bool operator==(const_codepoint_iterator other)
basic_grapheme_iterator(const_codepoint_iterator b, const_codepoint_iterator e, const_codepoint_iterator w, const_codepoint_iterator we)
start at a specific point - precalculated start and end of grapheme
constexpr codepoint & operator=(const codepoint &x)=default
basic_grapheme_iterator(const basic_grapheme_iterator &copy)
copy constructor
static prop_value_type resolve(codepoint c)
bool operator==(basic_grapheme_iterator other)
basic_grapheme_iterator operator++()
basic_grapheme< UnderlyingEncodedStringView > grapheme
bool operator!=(basic_grapheme_iterator other)
UnderlyingEncodedStringView::const_iterator const_codepoint_iterator
Represents a single grapheme cluster.
codepoint property handling
basic_grapheme_iterator operator++(int i)