u5e
Unicode Text C++ Library
utf8_bounds.hpp
1 #ifndef INCLUDED_U5E_UTF8_BOUNDS
2 #define INCLUDED_U5E_UTF8_BOUNDS
3 
4 #include <cmath>
5 #include <u5e/utf8_util.hpp>
6 #include <u5e/iterator_assertion.hpp>
7 
8 namespace u5e {
9  /**
10  * \brief Check and enforce bounds of utf8 text
11  *
12  * This will only look at the last 6 octets of the text and will
13  * only look at the first octet. It will not guarantee that the
14  * entire text is valid. The intent of this class is to provide a
15  * cheap safety check to make sure you will not have any under or
16  * overflow when processing this text.
17  *
18  * \tparam NativeIterator The native type to be iterated over.
19  */
20  template <typename NativeIterator>
21  class utf8_bounds {
22  public:
23  /**
24  * The NativeIterator must match the attributes of char
25  */
26  iterator_assertion<NativeIterator, char> _assertions;
27 
28  /**
29  * Check the bounds of the utf8 text, returns true if the text has
30  * correct bounds.
31  */
32  static bool check(NativeIterator begin, NativeIterator end) {
34  return false;
35  } else {
36  int max_walkback = 6; // mathematically, it's impossible for
37  // something more than 6 elements away
38  // from the end to generate a overflow.
39  int walkback = 0;
40  while (walkback < max_walkback && end != begin) {
41  char octet = *end;
43  if (utf8_util::codepoint_size(octet) > walkback) {
44  return false;
45  }
46  }
47  --end; walkback++;
48  }
49  return true;
50  }
51  }
52 
53  /**
54  * Enforce the bounds of the utf8 text, replace any bad character
55  * in the bounds by '?. Returns false if any substitution was made.
56  */
57  static bool enforce(NativeIterator begin, NativeIterator end) {
58  bool ret = true;
60  *begin = '?';
61  ++begin;
62  ret = false;
63  }
64  int max_walkback = 6; // mathematically, it's impossible for
65  // something more than 6 elements away
66  // from the end to generate a overflow.
67  int walkback = 0;
68  while (walkback < max_walkback && end != begin) {
69  char octet = *end;
71  if (utf8_util::codepoint_size(octet) > walkback) {
72  *end = '?';
73  ret = false;
74  }
75  }
76  --end; walkback++;
77  }
78  return ret;
79  }
80  };
81 }
82 
83 #endif
static bool is_codepoint_start(const char octet)
Definition: utf8_util.hpp:39
main u5e namespace
static bool check(NativeIterator begin, NativeIterator end)
Definition: utf8_bounds.hpp:32
static bool enforce(NativeIterator begin, NativeIterator end)
Definition: utf8_bounds.hpp:57
Check and enforce bounds of utf8 text.
Definition: utf8_bounds.hpp:21
static bool is_codepoint_continuation(const char octet)
Definition: utf8_util.hpp:28
static int codepoint_size(const char first_octet)
Definition: utf8_util.hpp:50
iterator_assertion< NativeIterator, char > _assertions
Definition: utf8_bounds.hpp:26
Basic operations necessary for implementing utf8.
Definition: utf8_util.hpp:11
Asserts the iterator is consistently defined.