u5e
Unicode Text C++ Library
utf8_iterator.hpp
1 #ifndef INCLUDED_U5E_UTF8_ITERATOR
2 #define INCLUDED_U5E_UTF8_ITERATOR
3 
4 #include <cmath>
5 #include <iterator>
6 #include <u5e/codepoint.hpp>
7 #include <u5e/iterator_assertion.hpp>
8 #include <u5e/utf8_util.hpp>
9 
10 namespace u5e {
11  /**
12  * \brief Defines the basic inner workings of utf8 iterator
13  *
14  * \tparam NativeIterator The underlying type to be iterated over.
15  */
16  template <typename NativeIterator>
18  public:
19  /**
20  * The NativeIterator must match the attributes of char
21  */
22  iterator_assertion<NativeIterator, char> _assertions;
23  /**
24  * This class composes over the NativeIterator
25  */
26  NativeIterator raw_iterator_;
27 
28  //@{
29  /**
30  * Basic iterator typedefs
31  */
33  typedef const codepoint& reference;
34  typedef int difference_type;
35  typedef std::bidirectional_iterator_tag iterator_category;
36  //@}
37 
38  /**
39  * Create an iterator from the underlying iterator
40  */
41  inline utf8_iterator_base(const NativeIterator raw_iterator)
42  : raw_iterator_(raw_iterator) {
43  };
44 
45  /**
46  * When iterating in reverse, you need to be able to find
47  * where the current codepoint started.
48  */
49  inline bool rewind_to_start_of_codepoint(const char current_octet) {
50  // when we do '*it = codepoint', we will leave the iterator
51  // halfway into the next character
52  bool ret = false;
54  raw_iterator_--;
55  ret = true;
56  }
57  return ret;
58  }
59 
60  /**
61  * Advance the iterator to the next codepoint
62  */
63  inline void forward_one_codepoint() {
66  std::advance(raw_iterator_, size);
67  }
68 
69  /**
70  * Go to the previous codepoint.
71  */
72  inline void rewind_one_codepoint() {
74  raw_iterator_--;
76  raw_iterator_--;
77  }
78  }
79 
80  /**
81  * Return the codepoint that starts where we are now
82  */
84  char first_octet = *raw_iterator_;
85  if (utf8_util::is_7bit_character(first_octet)) {
86  return first_octet;
87  } else {
88  if (rewind_to_start_of_codepoint(first_octet)) {
89  first_octet = *raw_iterator_;
90  }
91  NativeIterator copy_ = raw_iterator_;
92  difference_type size =
93  utf8_util::codepoint_size(first_octet);
94  unsigned char mask_first_octet = ~(0xFF<<(7-size));
95  int value = (first_octet & mask_first_octet);
96  while (--size) {
97  value = value<<6 | (*(++copy_) & 0b00111111);
98  }
99  return value;
100  }
101  }
102 
103  };
104 
105  /**
106  * \brief const iterator for utf8 encoded strings.
107  * \tparam NativeIterator The underlying type to be iterated over.
108  */
109  template <typename NativeIterator>
111  : public utf8_iterator_base<NativeIterator> {
112  public:
113  /**
114  * Offers itself as the pointer type
115  */
117 
118  /**
119  * Create from the underlying iterator type
120  */
121  inline utf8_const_iterator(const NativeIterator raw_iterator)
122  : utf8_iterator_base<NativeIterator>(raw_iterator) { };
123 
124  /**
125  * Copy constructor
126  */
128  : utf8_iterator_base<NativeIterator>(tocopy.raw_iterator_) { };
129 
130  //@{
131  /**
132  * Advance the iterator
133  */
135  this->forward_one_codepoint();
136  return *this;
137  }
138 
139  inline utf8_const_iterator operator++(int junk) {
140  utf8_const_iterator copy(this->raw_iterator_);
141  ++(*this);
142  return copy;
143  }
144  //@}
145 
146  //@{
147  /**
148  * Rewinds the iterator
149  */
151  this->rewind_one_codepoint();
152  return *this;
153  }
154 
155  inline utf8_const_iterator operator--(int junk) {
156  utf8_const_iterator copy(this->raw_iterator_);
157  --(*this);
158  return copy;
159  }
160  //@}
161 
162  //@{
163  /**
164  * Compare with another iterator
165  */
166  inline bool operator==(const utf8_const_iterator& rhs) const {
167  char c;
168  utf8_const_iterator copy(*this);
169  c = *(copy.raw_iterator_);
170  copy.rewind_to_start_of_codepoint(c);
171  c = *(copy.raw_iterator_);
172  int size = utf8_util::codepoint_size(c);
173  while (size) {
174  if (copy.raw_iterator_ == rhs.raw_iterator_) {
175  return true;
176  }
177  ++(copy.raw_iterator_);
178  --size;
179  }
180  return false;
181  }
182 
183  inline bool operator!=(const utf8_const_iterator& rhs) const {
184  return !(*this == rhs);
185  }
186  //@}
187 
188  /**
189  * Dereference the current codepoint out of the iterator
190  */
191  inline const codepoint operator*() {
192  return this->current_codepoint();
193  }
194 
195  };
196 
197  /**
198  * \brief mutable utf8 iterator
199  *
200  * Note that if you set a value in the middle of a text, you will
201  * likely make the string invalid. Most of the time you should only
202  * consider appending to an iterator, never writing in the middle of
203  * the text.
204  * \tparam NativeIterator The underlying type to be iterated over.
205  */
206  template <typename NativeIterator>
208  : public utf8_iterator_base<NativeIterator> {
209  public:
210  /**
211  * Offer itself as the pointer type
212  */
214 
215  /**
216  * Construct from the underlying iterator
217  */
218  inline utf8_iterator(const NativeIterator raw_iterator)
219  : utf8_iterator_base<NativeIterator>(raw_iterator) {};
220 
221  /**
222  * Copy constructor
223  */
224  inline utf8_iterator(const utf8_iterator& tocopy)
225  : utf8_iterator_base<NativeIterator>(tocopy.raw_iterator_) {};
226 
227  //@{
228  /**
229  * Advance the iterator
230  */
231  inline utf8_iterator& operator++() {
232  this->forward_one_codepoint();
233  return *this;
234  }
235 
236  inline utf8_iterator operator++(int junk) {
237  utf8_iterator copy(this->raw_iterator_);
238  ++(*this);
239  return copy;
240  }
241  //@}
242 
243  //@{
244  /**
245  * Rewind the iterator
246  */
247  inline utf8_iterator& operator--() {
248  this->rewind_one_codepoint();
249  return *this;
250  }
251 
252  inline utf8_iterator operator--(int junk) {
253  utf8_iterator copy(this->raw_iterator_);
254  --(*this);
255  return copy;
256  }
257  //@}
258 
259  //@{
260  /**
261  * Compare the iterator with another iterator
262  */
263  inline bool operator==(const utf8_iterator& rhs) const {
264  char c;
265  utf8_iterator copy(*this);
266  c = *(copy.raw_iterator_);
267  copy.rewind_to_start_of_codepoint(c);
268  c = *(copy.raw_iterator_);
269  int size = utf8_util::codepoint_size(c);
270  while (size) {
271  if (copy.raw_iterator_ == rhs.raw_iterator_) {
272  return true;
273  }
274  ++(copy.raw_iterator_);
275  --size;
276  }
277  return false;
278  }
279 
280  inline bool operator!=(const utf8_iterator& rhs) const {
281  return !(*this == rhs);
282  }
283  //@}
284 
285  /**
286  * \brief offers write access to the iterator at a given position
287  *
288  * This is necessary because operator= can only be done after
289  * operator* is executed, this wouldn't be necessary if there was
290  * a dedicated operator for 'assign to the dereference'.
291  */
292  class proxyobject : public codepoint {
293  private:
294  /**
295  * A proxy object refers to an iterator state
296  */
297  utf8_iterator<NativeIterator>& ref;
298  public:
299 
300  /**
301  * Create from the iterator
302  */
303  proxyobject(utf8_iterator<NativeIterator>& refin)
304  :ref(refin) {
305  utf8_iterator<NativeIterator> copy = refin;
306  value = copy.current_codepoint().value;
307  };
308 
309  /**
310  * Assign a codepoint to this position, writing as many octets
311  * as necessary. Note that if you do this in the middle of a
312  * string, there is a likely chance that you will render the
313  * remainder of the string invalid. So it's really only a good
314  * idea to do this as an "append" operation.
315  */
317  int value = c.value; // operate on codepoint as integer
318  int size = utf8_util::encoded_size(value);
319  if (size <= 1) {
320  *(ref.raw_iterator_) = (value & 0xFF);
321  } else {
322  unsigned char first_octet = (0xFF<<(8-size));
323  first_octet |= ((value>>((size-1)*6)) & 0xFF);
324  *(ref.raw_iterator_) = first_octet;
325  while (--size) {
326  unsigned char octet = 0b10000000;
327  octet |= ((value>>((size-1)*6)) & 0b00111111);
328  ref.raw_iterator_++;
329  *(ref.raw_iterator_) = octet;
330  }
331  }
332  return *this;
333  }
334  };
335 
336  /**
337  * mutable utf8 iterator returns a proxy object in order to allow
338  * assignment to happen.
339  */
340  inline proxyobject operator*() {
341  return proxyobject(*this);
342  }
343 
344  };
345 };
346 
347 #endif
NativeIterator raw_iterator_
bool operator==(const utf8_iterator &rhs) const
proxyobject & operator=(const codepoint c)
utf8_const_iterator(const utf8_const_iterator &tocopy)
utf8_iterator(const NativeIterator raw_iterator)
utf8_iterator(const utf8_iterator &tocopy)
main u5e namespace
std::bidirectional_iterator_tag iterator_category
Native representation of a codepoint.
Definition: codepoint.hpp:15
mutable utf8 iterator
utf8_const_iterator operator++(int junk)
const iterator for utf8 encoded strings.
Defines the basic inner workings of utf8 iterator.
bool rewind_to_start_of_codepoint(const char current_octet)
utf8_const_iterator(const NativeIterator raw_iterator)
proxyobject operator*()
static int encoded_size(int value)
Definition: utf8_util.hpp:60
iterator_assertion< NativeIterator, char > _assertions
utf8_const_iterator operator--(int junk)
utf8_iterator_base(const NativeIterator raw_iterator)
utf8_iterator & operator++()
bool operator==(const utf8_const_iterator &rhs) const
utf8_const_iterator & operator++()
utf8_iterator operator++(int junk)
utf8_const_iterator pointer
offers write access to the iterator at a given position
static bool is_7bit_character(const char octet)
Definition: utf8_util.hpp:17
utf8_iterator pointer
static bool is_codepoint_continuation(const char octet)
Definition: utf8_util.hpp:28
static int codepoint_size(const char first_octet)
Definition: utf8_util.hpp:50
const codepoint current_codepoint()
Basic operations necessary for implementing utf8.
Definition: utf8_util.hpp:11
bool operator!=(const utf8_const_iterator &rhs) const
proxyobject(utf8_iterator< NativeIterator > &refin)
utf8_iterator operator--(int junk)
utf8_iterator & operator--()
bool operator!=(const utf8_iterator &rhs) const
Asserts the iterator is consistently defined.
utf8_const_iterator & operator--()
const codepoint operator*()
codepoint_traits::int_type value
Definition: codepoint.hpp:20
const codepoint & reference