You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

308 lines
9.4 KiB

6 years ago
6 years ago
6 years ago
  1. /*
  2. * This program source code file is part of KiCad, a free EDA CAD application.
  3. *
  4. * Copyright (C) 2013 SoftPLC Corporation, Dick Hollenbeck <dick@softplc.com>
  5. * Copyright (C) 2013-2021 KiCad Developers, see AUTHORS.txt for contributors.
  6. *
  7. * @author Dick Hollenbeck
  8. *
  9. * This program is free software; you can redistribute it and/or
  10. * modify it under the terms of the GNU General Public License
  11. * as published by the Free Software Foundation; either version 2
  12. * of the License, or (at your option) any later version.
  13. *
  14. * This program is distributed in the hope that it will be useful,
  15. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17. * GNU General Public License for more details.
  18. *
  19. * You should have received a copy of the GNU General Public License
  20. * along with this program; if not, you may find one here:
  21. * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
  22. * or you may search the http://www.gnu.org website for the version 2 license,
  23. * or you may write to the Free Software Foundation, Inc.,
  24. * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
  25. */
  26. #ifndef UTF8_H_
  27. #define UTF8_H_
  28. #include <string>
  29. #include <wx/string.h>
  // Debug builds turn on UTF8 verification.
  // This might someday be a hidden cmake config option.
  #ifdef DEBUG
  #define UTF8_VERIFY
  #endif
  /// Report whether a C string is UTF8 encoded.
  ///
  /// A plain ASCII string also counts as a valid UTF8 string.
  ///
  /// @param aString is the C string to examine.
  /// @return true when @a aString is UTF8 encoded.
  bool IsUTF8( const char* aString );
  // MAYBE_VERIFY_UTF8( x ) expands to nothing unless UTF8_VERIFY is defined, in which
  // case it asserts (through wxASSERT) that IsUTF8( x ) holds.
  #ifndef UTF8_VERIFY
  #define MAYBE_VERIFY_UTF8(x)
  #else
  #define MAYBE_VERIFY_UTF8(x) wxASSERT( IsUTF8(x) )
  #endif
  44. /**
  45. * An 8 bit string that is assuredly encoded in UTF8, and supplies special conversion
  46. * support to and from wxString, to and from std::string, and has non-mutating iteration
  47. * over Unicode characters.
  48. *
  49. * I've been careful to supply only conversion facilities and not try and duplicate
  50. * wxString() with many member functions. There are multiple ways to create text into
  51. * a std::string without the need of too many member functions:
  52. *
  53. * - richio.h's StrPrintf().
  54. * - std::ostringstream.
  55. *
  56. * Because this class uses no virtuals, it should be possible to cast any std::string
  57. * into a UTF8 using this kind of cast: (UTF8 &) without construction or copying being
  58. * the effect of the cast. Be sure the source std::string holds UTF8 encoded text before
  59. * you do that.
  60. */
  61. class UTF8
  62. {
  63. public:
  64. UTF8( const wxString& o );
  65. /// This is a constructor for which you could end up with
  66. /// non-UTF8 encoding, but that would be your fault.
  67. UTF8( const char* txt ) :
  68. m_s( txt )
  69. {
  70. MAYBE_VERIFY_UTF8( c_str() );
  71. }
  72. /// For use with _() function on wx 2.8.
  73. /// BTW _() on wx >= 2.9 returns wxString, not wchar_t* like on 2.8.
  74. UTF8( const wchar_t* txt );
  75. UTF8( const std::string& o ) :
  76. m_s( o )
  77. {
  78. MAYBE_VERIFY_UTF8( c_str() );
  79. }
  80. UTF8()
  81. {
  82. }
  83. ~UTF8() // Needed mainly to build python wrapper
  84. {
  85. }
  86. // expose some std::string functions publicly, since base class must be private.
  87. const char* c_str() const { return m_s.c_str(); }
  88. bool empty() const { return m_s.empty(); }
  89. std::string::size_type find( char c ) const { return m_s.find( c ); }
  90. std::string::size_type find( char c, size_t s ) const { return m_s.find( c, s ); }
  91. void clear() { m_s.clear(); }
  92. std::string::size_type length() const { return m_s.length(); }
  93. std::string::size_type size() const { return m_s.size(); }
  94. int compare( const std::string& s ) const { return m_s.compare( s ); }
  95. bool operator==( const UTF8& rhs ) const { return m_s == rhs.m_s; }
  96. bool operator==( const std::string& rhs ) const { return m_s == rhs; }
  97. bool operator==( const char* s ) const { return m_s == s; }
  98. std::string::size_type find_first_of( const std::string& str,
  99. std::string::size_type pos = 0 ) const
  100. {
  101. return m_s.find_first_of( str, pos );
  102. }
  103. UTF8& operator+=( const UTF8& str )
  104. {
  105. m_s += str.m_s;
  106. MAYBE_VERIFY_UTF8( c_str() );
  107. return *this;
  108. }
  109. UTF8& operator+=( char ch )
  110. {
  111. m_s.operator+=( ch );
  112. MAYBE_VERIFY_UTF8( c_str() );
  113. return *this;
  114. }
  115. UTF8& operator+=( const char* s )
  116. {
  117. m_s.operator+=( s );
  118. MAYBE_VERIFY_UTF8( c_str() );
  119. return *this;
  120. }
  121. /// Append a wide (unicode) char to the UTF8 string.
  122. /// if this wide char is not a ASCII7 char, it will be added as a UTF8 multibyte sequence
  123. /// @param w_ch is a UTF-16 value (can be a UTF-32 on Linux)
  124. UTF8& operator+=( unsigned w_ch );
  125. // std::string::npos is not constexpr, so we can't use it in an
  126. // initializer.
  127. static constexpr std::string::size_type npos = -1;
  128. UTF8& operator=( const wxString& o );
  129. UTF8& operator=( const std::string& o )
  130. {
  131. m_s = o;
  132. MAYBE_VERIFY_UTF8( c_str() );
  133. return *this;
  134. }
  135. UTF8& operator=( const char* s )
  136. {
  137. m_s = s;
  138. MAYBE_VERIFY_UTF8( c_str() );
  139. return *this;
  140. }
  141. UTF8& operator=( char c )
  142. {
  143. m_s = c;
  144. MAYBE_VERIFY_UTF8( c_str() );
  145. return *this;
  146. }
  147. // a substring of a UTF8 is not necessarily a UTF8 if a multibyte character
  148. // was split, so return std::string not UTF8
  149. std::string substr( size_t pos = 0, size_t len = npos ) const
  150. {
  151. return m_s.substr( pos, len );
  152. }
  153. operator const std::string& () const { return m_s; }
  154. //operator std::string& () { return m_s; }
  155. //operator std::string () const { return m_s; }
  156. wxString wx_str() const;
  157. operator wxString () const;
  158. // "Read only" iterating over bytes is done with these, use the uni_iter to iterate
  159. // over UTF8 (multi-byte) characters
  160. std::string::const_iterator begin() const { return m_s.begin(); }
  161. std::string::const_iterator end() const { return m_s.end(); }
  162. #ifndef SWIG
  163. /**
  164. * uni_iter
  165. * is a non-mutating iterator that walks through unicode code points in the UTF8 encoded
  166. * string. The normal ++(), ++(int), ->(), and *() operators are all supported
  167. * for read only access and some return an unsigned holding the unicode character
  168. * appropriate for the respective operator.
  169. */
  170. class uni_iter
  171. {
  172. public:
  173. uni_iter() // Needed only to build python wrapper, not used outside the wrapper
  174. {
  175. it = nullptr;
  176. }
  177. uni_iter( const uni_iter& o )
  178. {
  179. it = o.it;
  180. }
  181. /// pre-increment and return uni_iter at new position
  182. const uni_iter& operator++()
  183. {
  184. it += uni_forward( it );
  185. return *this;
  186. }
  187. /// post-increment and return uni_iter at initial position
  188. uni_iter operator++( int )
  189. {
  190. uni_iter ret = *this;
  191. it += uni_forward( it );
  192. return ret;
  193. }
  194. /// return unicode at current position
  195. unsigned operator->() const
  196. {
  197. unsigned result;
  198. // grab the result, do not advance
  199. uni_forward( it, &result );
  200. return result;
  201. }
  202. /// return unicode at current position
  203. unsigned operator*() const
  204. {
  205. unsigned result;
  206. // grab the result, do not advance
  207. uni_forward( it, &result );
  208. return result;
  209. }
  210. uni_iter operator-( int aVal ) const { return uni_iter( (char*) it - aVal ); }
  211. bool operator==( const uni_iter& other ) const { return it == other.it; }
  212. bool operator!=( const uni_iter& other ) const { return it != other.it; }
  213. /// Since the ++ operators advance more than one byte, this is your best
  214. /// loop termination test, < end(), not == end().
  215. bool operator< ( const uni_iter& other ) const { return it < other.it; }
  216. bool operator<=( const uni_iter& other ) const { return it <= other.it; }
  217. bool operator> ( const uni_iter& other ) const { return it > other.it; }
  218. bool operator>=( const uni_iter& other ) const { return it >= other.it; }
  219. private:
  220. friend class UTF8;
  221. const unsigned char* it;
  222. // private constructor
  223. uni_iter( const char* start ) :
  224. it( (const unsigned char*) start )
  225. {
  226. }
  227. };
  228. /**
  229. * Returns a @a uni_iter initialized to the start of "this" UTF8 byte sequence.
  230. */
  231. uni_iter ubegin() const
  232. {
  233. return uni_iter( m_s.data() );
  234. }
  235. /**
  236. * Return a @a uni_iter initialized to the end of "this" UTF8 byte sequence.
  237. */
  238. uni_iter uend() const
  239. {
  240. return uni_iter( m_s.data() + m_s.size() );
  241. }
  242. /**
  243. * Advance over a single UTF8 encoded multibyte character, capturing the Unicode character
  244. * as it goes, and returning the number of bytes consumed.
  245. *
  246. * @param aSequence is the UTF8 byte sequence, must be aligned on start of character.
  247. * @param aResult is where to put the unicode character, and may be NULL if no interest.
  248. * @return the count of bytes consumed.
  249. */
  250. static int uni_forward( const unsigned char* aSequence, unsigned* aResult = nullptr );
  251. #endif // SWIG
  252. protected:
  253. std::string m_s;
  254. };
  255. #endif // UTF8_H_