2 Basic Unicode string class for Irrlicht.
3 Copyright (c) 2009-2011 John Norman
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any
7 damages arising from the use of this software.
9 Permission is granted to anyone to use this software for any
10 purpose, including commercial applications, and to alter it and
11 redistribute it freely, subject to the following restrictions:
13 1. The origin of this software must not be misrepresented; you
14 must not claim that you wrote the original software. If you use
15 this software in a product, an acknowledgment in the product
16 documentation would be appreciated but is not required.
18 2. Altered source versions must be plainly marked as such, and
19 must not be misrepresented as being the original software.
21 3. This notice may not be removed or altered from any source
24 The original version of this class can be located at:
25 http://irrlicht.suckerfreegames.com/
28 john@suckerfreegames.com
39 #define __BYTE_ORDER 0
40 #define __LITTLE_ENDIAN 0
41 #define __BIG_ENDIAN 1
42 #elif defined(__MACH__) && defined(__APPLE__)
43 #include <machine/endian.h>
44 #elif defined(__FreeBSD__) || defined(__DragonFly__)
45 #include <sys/endian.h>
57 #include "irrAllocator.h"
60 #include "irrString.h"
63 //! UTF-16 surrogate start values.
64 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
65 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
67 //! Is a UTF-16 code point a surrogate?
68 #define UTF16_IS_SURROGATE(c) (((c) & 0xF800) == 0xD800)
69 #define UTF16_IS_SURROGATE_HI(c) (((c) & 0xFC00) == 0xD800)
70 #define UTF16_IS_SURROGATE_LO(c) (((c) & 0xFC00) == 0xDC00)
76 // Define our character types.
77 typedef char32_t uchar32_t;
78 typedef char16_t uchar16_t;
79 typedef char uchar8_t;
87 //! The unicode replacement character. Used to replace invalid characters.
88 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
90 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
91 //! \param high The high value of the pair.
92 //! \param low The low value of the pair.
93 //! \return The UTF-32 character expressed by the surrogate pair.
94 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
96 // Convert the surrogate pair into a single UTF-32 character.
97 uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
98 uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
99 return (wu << 16) | x;
102 //! Swaps the endianness of a 16-bit value.
103 //! \return The new value.
104 inline uchar16_t swapEndian16(const uchar16_t& c)
106 return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
109 //! Swaps the endianness of a 32-bit value.
110 //! \return The new value.
111 inline uchar32_t swapEndian32(const uchar32_t& c)
113 return ((c >> 24) & 0x000000FF) |
114 ((c >> 8) & 0x0000FF00) |
115 ((c << 8) & 0x00FF0000) |
116 ((c << 24) & 0xFF000000);
119 //! The Unicode byte order mark.
120 const u16 BOM = 0xFEFF;
122 //! The size of the Unicode byte order mark in terms of the Unicode character size.
123 const u8 BOM_UTF8_LEN = 3;
124 const u8 BOM_UTF16_LEN = 1;
125 const u8 BOM_UTF32_LEN = 1;
127 //! Unicode byte order marks for file operations.
128 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
129 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
130 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
131 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
132 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
134 //! The size in bytes of the Unicode byte marks for file operations.
135 const u8 BOM_ENCODE_UTF8_LEN = 3;
136 const u8 BOM_ENCODE_UTF16_LEN = 2;
137 const u8 BOM_ENCODE_UTF32_LEN = 4;
139 //! Unicode encoding type.
152 //! Unicode endianness.
160 //! Returns the specified unicode byte order mark in a byte array.
161 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
162 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
163 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
164 //! \return An array that contains a byte order mark.
165 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
167 #define COPY_ARRAY(source, size) \
168 memcpy(ret.pointer(), source, size); \
171 core::array<u8> ret(4);
175 COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
178 #ifdef __BIG_ENDIAN__
179 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
181 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
185 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
188 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
191 #ifdef __BIG_ENDIAN__
192 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
194 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
198 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
201 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
204 // TODO sapier: fixed warning only,
205 // don't know if something needs to be done here
213 //! Detects if the given data stream starts with a unicode BOM.
214 //! \param data The data stream to check.
215 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
216 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
218 if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
219 if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
220 if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
221 if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
222 if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
226 } // end namespace unicode
229 //! UTF-16 string class.
230 template <typename TAlloc = irrAllocator<uchar16_t> >
235 ///------------------///
236 /// iterator classes ///
237 ///------------------///
239 //! Access an element in a unicode string, allowing one to change it.
240 class _ustring16_iterator_access
243 _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
245 //! Allow the class to be interpreted as a single UTF-32 character.
246 operator uchar32_t() const
251 //! Allow one to change the character in the unicode string.
252 //! \param c The new character to use.
254 _ustring16_iterator_access& operator=(const uchar32_t c)
260 //! Increments the value by 1.
262 _ustring16_iterator_access& operator++()
268 //! Increments the value by 1, returning the old value.
269 //! \return A unicode character.
270 uchar32_t operator++(int)
272 uchar32_t old = _get();
277 //! Decrements the value by 1.
279 _ustring16_iterator_access& operator--()
285 //! Decrements the value by 1, returning the old value.
286 //! \return A unicode character.
287 uchar32_t operator--(int)
289 uchar32_t old = _get();
294 //! Adds to the value by a specified amount.
295 //! \param val The amount to add to this character.
297 _ustring16_iterator_access& operator+=(int val)
303 //! Subtracts from the value by a specified amount.
304 //! \param val The amount to subtract from this character.
306 _ustring16_iterator_access& operator-=(int val)
312 //! Multiplies the value by a specified amount.
313 //! \param val The amount to multiply this character by.
315 _ustring16_iterator_access& operator*=(int val)
321 //! Divides the value by a specified amount.
322 //! \param val The amount to divide this character by.
324 _ustring16_iterator_access& operator/=(int val)
330 //! Modulos the value by a specified amount.
331 //! \param val The amount to modulo this character by.
333 _ustring16_iterator_access& operator%=(int val)
339 //! Adds to the value by a specified amount.
340 //! \param val The amount to add to this character.
341 //! \return A unicode character.
342 uchar32_t operator+(int val) const
347 //! Subtracts from the value by a specified amount.
348 //! \param val The amount to subtract from this character.
349 //! \return A unicode character.
350 uchar32_t operator-(int val) const
355 //! Multiplies the value by a specified amount.
356 //! \param val The amount to multiply this character by.
357 //! \return A unicode character.
358 uchar32_t operator*(int val) const
363 //! Divides the value by a specified amount.
364 //! \param val The amount to divide this character by.
365 //! \return A unicode character.
366 uchar32_t operator/(int val) const
371 //! Modulos the value by a specified amount.
372 //! \param val The amount to modulo this character by.
373 //! \return A unicode character.
374 uchar32_t operator%(int val) const
380 //! Gets a uchar32_t from our current position.
381 uchar32_t _get() const
383 const uchar16_t* a = ref->c_str();
384 if (!UTF16_IS_SURROGATE(a[pos]))
385 return static_cast<uchar32_t>(a[pos]);
388 if (pos + 1 >= ref->size_raw())
391 return unicode::toUTF32(a[pos], a[pos + 1]);
395 //! Sets a uchar32_t at our current position.
396 void _set(uchar32_t c)
398 ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
399 const uchar16_t* a = ref2->c_str();
402 // c will be multibyte, so split it up into the high and low surrogate pairs.
403 uchar16_t x = static_cast<uchar16_t>(c);
404 uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
405 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
407 // If the previous position was a surrogate pair, just replace them. Else, insert the low pair.
408 if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
409 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
410 else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
412 ref2->replace_raw(vh, static_cast<u32>(pos));
416 // c will be a single byte.
417 uchar16_t vh = static_cast<uchar16_t>(c);
419 // If the previous position was a surrogate pair, remove the extra byte.
420 if (UTF16_IS_SURROGATE_HI(a[pos]))
421 ref2->erase_raw(static_cast<u32>(pos) + 1);
423 ref2->replace_raw(vh, static_cast<u32>(pos));
427 const ustring16<TAlloc>* ref;
430 typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
433 //! Iterator to iterate through a UTF-16 string.
434 class _ustring16_const_iterator : public std::iterator<
435 std::bidirectional_iterator_tag, // iterator_category
436 access, // value_type
437 ptrdiff_t, // difference_type
438 const access, // pointer
439 const access // reference
443 typedef _ustring16_const_iterator _Iter;
444 typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
445 typedef const access const_pointer;
446 typedef const access const_reference;
448 typedef typename _Base::value_type value_type;
449 typedef typename _Base::difference_type difference_type;
450 typedef typename _Base::difference_type distance_type;
451 typedef typename _Base::pointer pointer;
452 typedef const_reference reference;
455 _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
456 _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
457 _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
459 if (ref->size_raw() == 0 || p == 0)
462 // Go to the appropriate position.
464 u32 sr = ref->size_raw();
465 const uchar16_t* a = ref->c_str();
466 while (i != 0 && pos < sr)
468 if (UTF16_IS_SURROGATE_HI(a[pos]))
475 //! Test for equality.
476 bool operator==(const _Iter& iter) const
478 if (ref == iter.ref && pos == iter.pos)
483 //! Test for inequality.
484 bool operator!=(const _Iter& iter) const
486 if (ref != iter.ref || pos != iter.pos)
491 //! Switch to the next full character in the string.
494 if (pos == ref->size_raw()) return *this;
495 const uchar16_t* a = ref->c_str();
496 if (UTF16_IS_SURROGATE_HI(a[pos]))
497 pos += 2; // TODO: check for valid low surrogate?
499 if (pos > ref->size_raw()) pos = ref->size_raw();
503 //! Switch to the next full character in the string, returning the previous position.
504 _Iter operator++(int)
511 //! Switch to the previous full character in the string.
514 if (pos == 0) return *this;
515 const uchar16_t* a = ref->c_str();
517 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0) // low surrogate, go back one more.
522 //! Switch to the previous full character in the string, returning the previous position.
523 _Iter operator--(int)
530 //! Advance a specified number of full characters in the string.
532 _Iter& operator+=(const difference_type v)
534 if (v == 0) return *this;
535 if (v < 0) return operator-=(v * -1);
537 if (pos >= ref->size_raw())
540 // Go to the appropriate position.
541 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
543 u32 sr = ref->size_raw();
544 const uchar16_t* a = ref->c_str();
545 while (i != 0 && pos < sr)
547 if (UTF16_IS_SURROGATE_HI(a[pos]))
558 //! Go back a specified number of full characters in the string.
560 _Iter& operator-=(const difference_type v)
562 if (v == 0) return *this;
563 if (v > 0) return operator+=(v * -1);
568 // Go to the appropriate position.
569 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
571 const uchar16_t* a = ref->c_str();
572 while (i != 0 && pos != 0)
575 if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
583 //! Return a new iterator that is a variable number of full characters forward from the current position.
584 _Iter operator+(const difference_type v) const
591 //! Return a new iterator that is a variable number of full characters backward from the current position.
592 _Iter operator-(const difference_type v) const
599 //! Returns the distance between two iterators.
600 difference_type operator-(const _Iter& iter) const
602 // Make sure we reference the same object!
604 return difference_type();
629 //! Accesses the full character at the iterator's position.
630 const_reference operator*() const
632 if (pos >= ref->size_raw())
634 const uchar16_t* a = ref->c_str();
635 u32 p = ref->size_raw();
636 if (UTF16_IS_SURROGATE_LO(a[p]))
638 reference ret(ref, p);
641 const_reference ret(ref, pos);
645 //! Accesses the full character at the iterator's position.
646 reference operator*()
648 if (pos >= ref->size_raw())
650 const uchar16_t* a = ref->c_str();
651 u32 p = ref->size_raw();
652 if (UTF16_IS_SURROGATE_LO(a[p]))
654 reference ret(ref, p);
657 reference ret(ref, pos);
661 //! Accesses the full character at the iterator's position.
662 const_pointer operator->() const
667 //! Accesses the full character at the iterator's position.
673 //! Is the iterator at the start of the string?
679 //! Is the iterator at the end of the string?
682 const uchar16_t* a = ref->c_str();
683 if (UTF16_IS_SURROGATE(a[pos]))
684 return (pos + 1) >= ref->size_raw();
685 else return pos >= ref->size_raw();
688 //! Moves the iterator to the start of the string.
694 //! Moves the iterator to the end of the string.
697 pos = ref->size_raw();
700 //! Returns the iterator's position.
701 //! \return The iterator's position.
708 const ustring16<TAlloc>* ref;
712 //! Iterator to iterate through a UTF-16 string.
713 class _ustring16_iterator : public _ustring16_const_iterator
716 typedef _ustring16_iterator _Iter;
717 typedef _ustring16_const_iterator _Base;
718 typedef typename _Base::const_pointer const_pointer;
719 typedef typename _Base::const_reference const_reference;
722 typedef typename _Base::value_type value_type;
723 typedef typename _Base::difference_type difference_type;
724 typedef typename _Base::distance_type distance_type;
725 typedef access pointer;
726 typedef access reference;
732 _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
733 _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
734 _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
736 //! Accesses the full character at the iterator's position.
737 reference operator*() const
739 if (pos >= ref->size_raw())
741 const uchar16_t* a = ref->c_str();
742 u32 p = ref->size_raw();
743 if (UTF16_IS_SURROGATE_LO(a[p]))
745 reference ret(ref, p);
748 reference ret(ref, pos);
752 //! Accesses the full character at the iterator's position.
753 reference operator*()
755 if (pos >= ref->size_raw())
757 const uchar16_t* a = ref->c_str();
758 u32 p = ref->size_raw();
759 if (UTF16_IS_SURROGATE_LO(a[p]))
761 reference ret(ref, p);
764 reference ret(ref, pos);
768 //! Accesses the full character at the iterator's position.
769 pointer operator->() const
774 //! Accesses the full character at the iterator's position.
781 typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
782 typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
784 ///----------------------///
785 /// end iterator classes ///
786 ///----------------------///
788 //! Default constructor
790 : array(0), allocated(1), used(0)
792 #if __BYTE_ORDER == __BIG_ENDIAN
793 encoding = unicode::EUTFE_UTF16_BE;
795 encoding = unicode::EUTFE_UTF16_LE;
797 array = allocator.allocate(1); // new u16[1];
803 ustring16(const ustring16<TAlloc>& other)
804 : array(0), allocated(0), used(0)
806 #if __BYTE_ORDER == __BIG_ENDIAN
807 encoding = unicode::EUTFE_UTF16_BE;
809 encoding = unicode::EUTFE_UTF16_LE;
815 //! Constructor from other string types
817 ustring16(const string<B>& other)
818 : array(0), allocated(0), used(0)
820 #if __BYTE_ORDER == __BIG_ENDIAN
821 encoding = unicode::EUTFE_UTF16_BE;
823 encoding = unicode::EUTFE_UTF16_LE;
828 //! Constructor from std::string
829 template <class B, class A, typename Alloc>
830 ustring16(const std::basic_string<B, A, Alloc>& other)
831 : array(0), allocated(0), used(0)
833 #if __BYTE_ORDER == __BIG_ENDIAN
834 encoding = unicode::EUTFE_UTF16_BE;
836 encoding = unicode::EUTFE_UTF16_LE;
838 *this = other.c_str();
842 //! Constructor from iterator.
843 template <typename Itr>
844 ustring16(Itr first, Itr last)
845 : array(0), allocated(0), used(0)
847 #if __BYTE_ORDER == __BIG_ENDIAN
848 encoding = unicode::EUTFE_UTF16_BE;
850 encoding = unicode::EUTFE_UTF16_LE;
852 reserve(std::distance(first, last));
855 for (; first != last; ++first)
856 append((uchar32_t)*first);
859 //! Constructor for copying a UTF-8 string from a pointer.
860 ustring16(const uchar8_t* const c)
861 : array(0), allocated(0), used(0)
863 #if __BYTE_ORDER == __BIG_ENDIAN
864 encoding = unicode::EUTFE_UTF16_BE;
866 encoding = unicode::EUTFE_UTF16_LE;
873 //! Constructor for copying a UTF-8 string from a single char.
874 ustring16(const char c)
875 : array(0), allocated(0), used(0)
877 #if __BYTE_ORDER == __BIG_ENDIAN
878 encoding = unicode::EUTFE_UTF16_BE;
880 encoding = unicode::EUTFE_UTF16_LE;
883 append((uchar32_t)c);
887 //! Constructor for copying a UTF-8 string from a pointer with a given length.
888 ustring16(const uchar8_t* const c, u32 length)
889 : array(0), allocated(0), used(0)
891 #if __BYTE_ORDER == __BIG_ENDIAN
892 encoding = unicode::EUTFE_UTF16_BE;
894 encoding = unicode::EUTFE_UTF16_LE;
901 //! Constructor for copying a UTF-16 string from a pointer.
902 ustring16(const uchar16_t* const c)
903 : array(0), allocated(0), used(0)
905 #if __BYTE_ORDER == __BIG_ENDIAN
906 encoding = unicode::EUTFE_UTF16_BE;
908 encoding = unicode::EUTFE_UTF16_LE;
915 //! Constructor for copying a UTF-16 string from a pointer with a given length
916 ustring16(const uchar16_t* const c, u32 length)
917 : array(0), allocated(0), used(0)
919 #if __BYTE_ORDER == __BIG_ENDIAN
920 encoding = unicode::EUTFE_UTF16_BE;
922 encoding = unicode::EUTFE_UTF16_LE;
929 //! Constructor for copying a UTF-32 string from a pointer.
930 ustring16(const uchar32_t* const c)
931 : array(0), allocated(0), used(0)
933 #if __BYTE_ORDER == __BIG_ENDIAN
934 encoding = unicode::EUTFE_UTF16_BE;
936 encoding = unicode::EUTFE_UTF16_LE;
943 //! Constructor for copying a UTF-32 from a pointer with a given length.
944 ustring16(const uchar32_t* const c, u32 length)
945 : array(0), allocated(0), used(0)
947 #if __BYTE_ORDER == __BIG_ENDIAN
948 encoding = unicode::EUTFE_UTF16_BE;
950 encoding = unicode::EUTFE_UTF16_LE;
957 //! Constructor for copying a wchar_t string from a pointer.
958 ustring16(const wchar_t* const c)
959 : array(0), allocated(0), used(0)
961 #if __BYTE_ORDER == __BIG_ENDIAN
962 encoding = unicode::EUTFE_UTF16_BE;
964 encoding = unicode::EUTFE_UTF16_LE;
967 if (sizeof(wchar_t) == 4)
968 append(reinterpret_cast<const uchar32_t* const>(c));
969 else if (sizeof(wchar_t) == 2)
970 append(reinterpret_cast<const uchar16_t* const>(c));
971 else if (sizeof(wchar_t) == 1)
972 append(reinterpret_cast<const uchar8_t* const>(c));
976 //! Constructor for copying a wchar_t string from a pointer with a given length.
977 ustring16(const wchar_t* const c, u32 length)
978 : array(0), allocated(0), used(0)
980 #if __BYTE_ORDER == __BIG_ENDIAN
981 encoding = unicode::EUTFE_UTF16_BE;
983 encoding = unicode::EUTFE_UTF16_LE;
986 if (sizeof(wchar_t) == 4)
987 append(reinterpret_cast<const uchar32_t* const>(c), length);
988 else if (sizeof(wchar_t) == 2)
989 append(reinterpret_cast<const uchar16_t* const>(c), length);
990 else if (sizeof(wchar_t) == 1)
991 append(reinterpret_cast<const uchar8_t* const>(c), length);
995 //! Constructor for moving a ustring16
996 ustring16(ustring16<TAlloc>&& other)
997 : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
999 //std::cout << "MOVE constructor" << std::endl;
1001 other.allocated = 0;
1008 allocator.deallocate(array); // delete [] array;
1012 //! Assignment operator
1013 ustring16& operator=(const ustring16<TAlloc>& other)
1018 used = other.size_raw();
1019 if (used >= allocated)
1021 allocator.deallocate(array); // delete [] array;
1022 allocated = used + 1;
1023 array = allocator.allocate(used + 1); //new u16[used];
1026 const uchar16_t* p = other.c_str();
1027 for (u32 i=0; i<=used; ++i, ++p)
1032 // Validate our new UTF-16 string.
1038 //! Move assignment operator
1039 ustring16& operator=(ustring16<TAlloc>&& other)
1043 //std::cout << "MOVE operator=" << std::endl;
1044 allocator.deallocate(array);
1046 array = other.array;
1047 allocated = other.allocated;
1048 encoding = other.encoding;
1056 //! Assignment operator for other string types
1058 ustring16<TAlloc>& operator=(const string<B>& other)
1060 *this = other.c_str();
1065 //! Assignment operator for UTF-8 strings
1066 ustring16<TAlloc>& operator=(const uchar8_t* const c)
1070 array = allocator.allocate(1); //new u16[1];
1075 if (!c) return *this;
1077 //! Append our string now.
1083 //! Assignment operator for UTF-16 strings
1084 ustring16<TAlloc>& operator=(const uchar16_t* const c)
1088 array = allocator.allocate(1); //new u16[1];
1093 if (!c) return *this;
1095 //! Append our string now.
1101 //! Assignment operator for UTF-32 strings
1102 ustring16<TAlloc>& operator=(const uchar32_t* const c)
1106 array = allocator.allocate(1); //new u16[1];
1111 if (!c) return *this;
1113 //! Append our string now.
1119 //! Assignment operator for wchar_t strings.
1120 /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1121 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1122 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
1123 ustring16<TAlloc>& operator=(const wchar_t* const c)
1125 if (sizeof(wchar_t) == 4)
1126 *this = reinterpret_cast<const uchar32_t* const>(c);
1127 else if (sizeof(wchar_t) == 2)
1128 *this = reinterpret_cast<const uchar16_t* const>(c);
1129 else if (sizeof(wchar_t) == 1)
1130 *this = reinterpret_cast<const uchar8_t* const>(c);
1136 //! Assignment operator for other strings.
1137 /** Note that this assumes that a correct unicode string is stored in the string. **/
1139 ustring16<TAlloc>& operator=(const B* const c)
1142 *this = reinterpret_cast<const uchar32_t* const>(c);
1143 else if (sizeof(B) == 2)
1144 *this = reinterpret_cast<const uchar16_t* const>(c);
1145 else if (sizeof(B) == 1)
1146 *this = reinterpret_cast<const uchar8_t* const>(c);
1152 //! Direct access operator
1153 access operator [](const u32 index)
1155 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1156 iterator iter(*this, index);
1157 return iter.operator*();
1161 //! Direct access operator
1162 const access operator [](const u32 index) const
1164 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1165 const_iterator iter(*this, index);
1166 return iter.operator*();
1170 //! Equality operator
1171 bool operator ==(const uchar16_t* const str) const
1177 for(i=0; array[i] && str[i]; ++i)
1178 if (array[i] != str[i])
1181 return !array[i] && !str[i];
1185 //! Equality operator
1186 bool operator ==(const ustring16<TAlloc>& other) const
1188 for(u32 i=0; array[i] && other.array[i]; ++i)
1189 if (array[i] != other.array[i])
1192 return used == other.used;
1196 //! Is smaller comparator
1197 bool operator <(const ustring16<TAlloc>& other) const
1199 for(u32 i=0; array[i] && other.array[i]; ++i)
1201 s32 diff = array[i] - other.array[i];
1206 return used < other.used;
1210 //! Inequality operator
1211 bool operator !=(const uchar16_t* const str) const
1213 return !(*this == str);
1217 //! Inequality operator
1218 bool operator !=(const ustring16<TAlloc>& other) const
1220 return !(*this == other);
1224 //! Returns the length of a ustring16 in full characters.
1225 //! \return Length of a ustring16 in full characters.
1228 const_iterator i(*this, 0);
1239 //! Informs if the ustring is empty or not.
1240 //! \return True if the ustring is empty, false if not.
1243 return (size_raw() == 0);
1247 //! Returns a pointer to the raw UTF-16 string data.
1248 //! \return pointer to C-style NUL terminated array of UTF-16 code points.
1249 const uchar16_t* c_str() const
1255 //! Compares the first n characters of this string with another.
1256 //! \param other Other string to compare to.
1257 //! \param n Number of characters to compare.
1258 //! \return True if the n first characters of both strings are equal.
1259 bool equalsn(const ustring16<TAlloc>& other, u32 n) const
1262 const uchar16_t* oa = other.c_str();
1263 for(i=0; i < n && array[i] && oa[i]; ++i)
1264 if (array[i] != oa[i])
1267 // if one (or both) of the strings was smaller then they
1268 // are only equal if they have the same length
1269 return (i == n) || (used == other.used);
1273 //! Compares the first n characters of this string with another.
1274 //! \param str Other string to compare to.
1275 //! \param n Number of characters to compare.
1276 //! \return True if the n first characters of both strings are equal.
1277 bool equalsn(const uchar16_t* const str, u32 n) const
1282 for(i=0; i < n && array[i] && str[i]; ++i)
1283 if (array[i] != str[i])
1286 // if one (or both) of the strings was smaller then they
1287 // are only equal if they have the same length
1288 return (i == n) || (array[i] == 0 && str[i] == 0);
1292 //! Appends a character to this ustring16
1293 //! \param character The character to append.
1294 //! \return A reference to our current string.
1295 ustring16<TAlloc>& append(uchar32_t character)
1297 if (used + 2 >= allocated)
1298 reallocate(used + 2);
1300 if (character > 0xFFFF)
1304 // character will be multibyte, so split it up into a surrogate pair.
1305 uchar16_t x = static_cast<uchar16_t>(character);
1306 uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1307 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1314 array[used-1] = character;
1322 //! Appends a UTF-8 string to this ustring16
1323 //! \param other The UTF-8 string to append.
1324 //! \param length The length of the string to append.
1325 //! \return A reference to our current string.
1326 ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
1331 // Determine if the string is long enough for a BOM.
1333 const uchar8_t* p = other;
1337 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1340 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1341 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1343 if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1344 c_bom = unicode::EUTFE_UTF8;
1347 // If a BOM was found, don't include it in the string.
1348 const uchar8_t* c2 = other;
1349 if (c_bom != unicode::EUTFE_NONE)
1351 c2 = other + unicode::BOM_UTF8_LEN;
1352 length -= unicode::BOM_UTF8_LEN;
1355 // Calculate the size of the string to read in.
1361 } while(*p++ && len < length);
1365 // If we need to grow the array, do it now.
1366 if (used + len >= allocated)
1367 reallocate(used + (len * 2));
1370 // Convert UTF-8 to UTF-16.
1372 for (u32 l = 0; l<len;)
1375 if (((c2[l] >> 6) & 0x03) == 0x02)
1376 { // Invalid continuation byte.
1377 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1380 else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1381 { // Invalid byte - overlong encoding.
1382 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1385 else if ((c2[l] & 0xF8) == 0xF0)
1386 { // 4 bytes UTF-8, 2 bytes UTF-16.
1387 // Check for a full string.
1390 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1398 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1399 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1400 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1403 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1409 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1410 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1411 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1412 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1414 // Split v up into a surrogate pair.
1415 uchar16_t x = static_cast<uchar16_t>(v);
1416 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1417 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1422 ++used; // Using two shorts this time, so increase used by 1.
1424 else if ((c2[l] & 0xF0) == 0xE0)
1425 { // 3 bytes UTF-8, 1 byte UTF-16.
1426 // Check for a full string.
1429 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1437 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1438 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1441 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1447 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1448 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1449 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1453 else if ((c2[l] & 0xE0) == 0xC0)
1454 { // 2 bytes UTF-8, 1 byte UTF-16.
1455 // Check for a full string.
1458 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1464 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1466 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1472 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1473 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1474 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1479 { // 1 byte UTF-8, 1 byte UTF-16.
1482 { // Values above 0xF4 are restricted and aren't used. By now, anything above 0x7F is invalid.
1483 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1485 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1491 // Validate our new UTF-16 string.
1498 //! Appends a UTF-16 string to this ustring16
1499 //! \param other The UTF-16 string to append.
1500 //! \param length The length of the string to append.
1501 //! \return A reference to our current string.
1502 ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
1507 // Determine if the string is long enough for a BOM.
1509 const uchar16_t* p = other;
1513 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1515 // Check for the BOM to determine the string's endianness.
1516 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1517 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1518 c_end = unicode::EUTFEE_LITTLE;
1519 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1520 c_end = unicode::EUTFEE_BIG;
1522 // If a BOM was found, don't include it in the string.
1523 const uchar16_t* c2 = other;
1524 if (c_end != unicode::EUTFEE_NATIVE)
1526 c2 = other + unicode::BOM_UTF16_LEN;
1527 length -= unicode::BOM_UTF16_LEN;
1530 // Calculate the size of the string to read in.
1536 } while(*p++ && len < length);
1540 // If we need to grow the size of the array, do it now.
1541 if (used + len >= allocated)
1542 reallocate(used + (len * 2));
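1546 // Copy the string now. NOTE(review): the loop below reads the source as c2[l] with l starting at `start` rather than 0; if `start` is a non-zero destination offset, the first `start` source units are skipped and the read runs past the counted length - confirm whether c2[l - start] was intended.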
1547 unicode::EUTF_ENDIAN m_end = getEndianness();
1548 for (u32 l = start; l < start + len; ++l)
1550 array[l] = (uchar16_t)c2[l];
1551 if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1552 array[l] = unicode::swapEndian16(array[l]);
1557 // Validate our new UTF-16 string.
1563 //! Appends a UTF-32 string to this ustring16
1564 //! \param other The UTF-32 string to append.
1565 //! \param length The length of the string to append.
1566 //! \return A reference to our current string.
1567 ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
1572 // Check for the BOM to determine the string's endianness.
1573 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1574 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1575 c_end = unicode::EUTFEE_LITTLE;
1576 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1577 c_end = unicode::EUTFEE_BIG;
1579 // If a BOM was found, don't include it in the string.
1580 const uchar32_t* c2 = other;
1581 if (c_end != unicode::EUTFEE_NATIVE)
1583 c2 = other + unicode::BOM_UTF32_LEN;
1584 length -= unicode::BOM_UTF32_LEN;
1587 // Calculate the size of the string to read in.
1589 const uchar32_t* p = c2;
1593 } while(*p++ && len < length);
1597 // If we need to grow the size of the array, do it now.
1598 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1599 if (used + (len * 2) >= allocated)
1600 reallocate(used + ((len * 2) * 2));
1603 // Convert UTF-32 to UTF-16.
1604 unicode::EUTF_ENDIAN m_end = getEndianness();
1606 for (u32 l = 0; l<len; ++l)
1610 uchar32_t ch = c2[l];
1611 if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1612 ch = unicode::swapEndian32(ch);
1616 // Split ch up into a surrogate pair as it is over 16 bits long.
1617 uchar16_t x = static_cast<uchar16_t>(ch);
1618 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1619 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1622 ++used; // Using two shorts, so increased used again.
1624 else if (ch >= 0xD800 && ch <= 0xDFFF)
1626 // Between possible UTF-16 surrogates (invalid!)
1627 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1629 else array[pos++] = static_cast<uchar16_t>(ch);
1633 // Validate our new UTF-16 string.
1640 //! Appends a ustring16 to this ustring16
1641 //! \param other The string to append to this one.
1642 //! \return A reference to our current string.
1643 ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
1645 const uchar16_t* oa = other.c_str();
1647 u32 len = other.size_raw();
1649 if (used + len >= allocated)
1650 reallocate(used + len);
1652 for (u32 l=0; l<len; ++l)
1653 array[used+l] = oa[l];
1662 //! Appends a certain amount of characters of a ustring16 to this ustring16.
1663 //! \param other The string to append to this one.
1664 //! \param length How many characters of the other string to add to this one.
1665 //! \return A reference to our current string.
1666 ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
1668 if (other.size() == 0)
1671 if (other.size() < length)
1677 if (used + length * 2 >= allocated)
1678 reallocate(used + length * 2);
1680 const_iterator iter(other, 0);
1682 while (!iter.atEnd() && l)
1684 uchar32_t c = *iter;
1694 //! Reserves some memory.
1695 //! \param count The amount of characters to reserve.
1696 void reserve(u32 count)
1698 if (count < allocated)
1705 //! Finds first occurrence of character.
1706 //! \param c The character to search for.
1707 //! \return Position where the character has been found, or -1 if not found.
1708 s32 findFirst(uchar32_t c) const
1710 const_iterator i(*this, 0);
1725 //! Finds first occurrence of a character of a list.
1726 //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1727 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1728 //! \return Position where one of the characters has been found, or -1 if not found.
1729 s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1734 const_iterator i(*this, 0);
1740 for (u32 j=0; j<count; ++j)
1751 //! Finds first position of a character not in a given list.
1752 //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1753 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1754 //! \return Position where the character has been found, or -1 if not found.
1755 s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1760 const_iterator i(*this, 0);
1767 for (j=0; j<count; ++j)
1780 //! Finds last position of a character not in a given list.
1781 //! \param c A list of characters to NOT find. For example if the method should find the last occurrence of a character that is neither 'a' nor 'b', this parameter should be "ab".
1782 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1783 //! \return Position where the character has been found, or -1 if not found.
1784 s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1789 const_iterator i(end());
1792 s32 pos = size() - 1;
1793 while (!i.atStart())
1797 for (j=0; j<count; ++j)
1810 //! Finds next occurrence of character.
1811 //! \param c The character to search for.
1812 //! \param startPos The position in the string to start searching.
1813 //! \return Position where the character has been found, or -1 if not found.
1814 s32 findNext(uchar32_t c, u32 startPos) const
1816 const_iterator i(*this, startPos);
1832 //! Finds last occurrence of character.
1833 //! \param c The character to search for.
1834 //! \param start The start position of the reverse search ( default = -1, on end ).
1835 //! \return Position where the character has been found, or -1 if not found.
1836 s32 findLast(uchar32_t c, s32 start = -1) const
1839 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1841 const_iterator i(*this, start);
1843 while (!i.atStart())
1855 //! Finds last occurrence of a character in a list.
1856 //! \param c A list of characters to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1857 //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1858 //! \return Position where one of the characters has been found, or -1 if not found.
1859 s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1864 const_iterator i(end());
1868 while (!i.atStart())
1871 for (u32 j=0; j<count; ++j)
1882 //! Finds another ustring16 in this ustring16.
1883 //! \param str The string to find.
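1884 //! \param start The start position of the search. NOTE(review): the size check computes my_size - start on unsigned values, so a start larger than size() wraps around instead of tripping the check - confirm callers never pass one.
1885 //! \return Position where the ustring16 has been found, or -1 if not found.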
1886 s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
1888 u32 my_size = size();
1889 u32 their_size = str.size();
1891 if (their_size == 0 || my_size - start < their_size)
1894 const_iterator i(*this, start);
1899 const_iterator i2(i);
1900 const_iterator j(str, 0);
1901 uchar32_t t1 = (uchar32_t)*i2;
1902 uchar32_t t2 = (uchar32_t)*j;
1909 t1 = (uchar32_t)*i2;
1920 //! Finds another ustring16 in this ustring16 by comparing raw UTF-16 code units.
1921 //! \param str The string to find.
1922 //! \param start The start position of the search, as a raw index into the UTF-16 array.
1923 //! \return Position where the string has been found, or -1 if not found.
1924 s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
1926 const uchar16_t* data = str.c_str();
1937 for (u32 i=start; i<=used-len; ++i)
1941 while(data[j] && array[i+j] == data[j])
1953 //! Returns a substring.
1954 //! \param begin Start of substring.
1955 //! \param length Length of substring.
1956 //! \return A new ustring16 holding the substring (empty when length <= 0 or begin >= len).
1957 ustring16<TAlloc> subString(u32 begin, s32 length) const
1960 // if start after ustring16
1961 // or no proper substring length
1962 if ((length <= 0) || (begin>=len))
1963 return ustring16<TAlloc>("");
1964 // clamp length to maximal value
1965 if ((length+begin) > len)
1968 ustring16<TAlloc> o;
1969 o.reserve((length+1) * 2);
1971 const_iterator i(*this, begin);
1972 while (!i.atEnd() && length)
1983 //! Appends a character to this ustring16.
1984 //! \param c Character to append.
1985 //! \return A reference to our current string.
1986 ustring16<TAlloc>& operator += (char c)
1988 append((uchar32_t)c);
1993 //! Appends a character to this ustring16.
1994 //! \param c Character to append.
1995 //! \return A reference to our current string.
1996 ustring16<TAlloc>& operator += (uchar32_t c)
2003 //! Appends a number to this ustring16.
2004 //! \param c Number to append.
2005 //! \return A reference to our current string.
2006 ustring16<TAlloc>& operator += (short c)
2008 append(core::stringc(c));
2013 //! Appends a number to this ustring16.
2014 //! \param c Number to append.
2015 //! \return A reference to our current string.
2016 ustring16<TAlloc>& operator += (unsigned short c)
2018 append(core::stringc(c));
2023 //! Appends a number to this ustring16.
2024 //! \param c Number to append.
2025 //! \return A reference to our current string.
2026 ustring16<TAlloc>& operator += (int c)
2028 append(core::stringc(c));
2033 //! Appends a number to this ustring16.
2034 //! \param c Number to append.
2035 //! \return A reference to our current string.
2036 ustring16<TAlloc>& operator += (unsigned int c)
2038 append(core::stringc(c));
2043 //! Appends a number to this ustring16.
2044 //! \param c Number to append.
2045 //! \return A reference to our current string.
2046 ustring16<TAlloc>& operator += (long c)
2048 append(core::stringc(c));
2053 //! Appends a number to this ustring16.
2054 //! \param c Number to append.
2055 //! \return A reference to our current string.
2056 ustring16<TAlloc>& operator += (unsigned long c)
2058 append(core::stringc(c));
2063 //! Appends a number to this ustring16.
2064 //! \param c Number to append.
2065 //! \return A reference to our current string.
2066 ustring16<TAlloc>& operator += (double c)
2068 append(core::stringc(c));
2073 //! Appends a uchar16_t string to this ustring16.
2074 //! \param c The uchar16_t string to append.
2075 //! \return A reference to our current string.
2076 ustring16<TAlloc>& operator += (const uchar16_t* const c)
2083 //! Appends a ustring16 to this ustring16.
2084 //! \param other ustring16 to append.
2085 //! \return A reference to our current string.
2086 ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
2093 //! Replaces every occurrence of a given character with another one.
2094 //! \param toReplace Character to replace.
2095 //! \param replaceWith Character replacing the old one.
2096 //! \return A reference to our current string.
2097 ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
2099 iterator i(*this, 0);
2102 typename ustring16<TAlloc>::access a = *i;
2103 if ((uchar32_t)a == toReplace)
2111 //! Replaces all instances of a string with another one.
2112 //! \param toReplace The string to replace.
2113 //! \param replaceWith The string replacing the old one.
2114 //! \return A reference to our current string.
2115 ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
2117 if (toReplace.size() == 0)
2120 const uchar16_t* other = toReplace.c_str();
2121 const uchar16_t* replace = replaceWith.c_str();
2122 const u32 other_size = toReplace.size_raw();
2123 const u32 replace_size = replaceWith.size_raw();
2125 // Determine the delta. The algorithm will change depending on the delta.
2126 s32 delta = replace_size - other_size;
2128 // A character for character replace. The string will not shrink or grow.
2132 while ((pos = find_raw(other, pos)) != -1)
2134 for (u32 i = 0; i < replace_size; ++i)
2135 array[pos + i] = replace[i];
2141 // We are going to be removing some characters. The string will shrink.
2145 for (u32 pos = 0; pos <= used; ++i, ++pos)
2147 // Is this potentially a match?
2148 if (array[pos] == *other)
2150 // Check to see if we have a match.
2152 for (j = 0; j < other_size; ++j)
2154 if (array[pos + j] != other[j])
2158 // If we have a match, replace characters.
2159 if (j == other_size)
2161 for (j = 0; j < replace_size; ++j)
2162 array[i + j] = replace[j];
2163 i += replace_size - 1;
2164 pos += other_size - 1;
2169 // No match found, just copy characters.
2170 array[i - 1] = array[pos];
2178 // We are going to be adding characters, so the string size will increase.
2179 // Count the number of times toReplace exists in the string so we can allocate the new size.
2182 while ((pos = find_raw(other, pos)) != -1)
2188 // Re-allocate the string now, if needed.
2189 u32 len = delta * find_count;
2190 if (used + len >= allocated)
2191 reallocate(used + len);
2195 while ((pos = find_raw(other, pos)) != -1)
2197 uchar16_t* start = array + pos + other_size - 1;
2198 uchar16_t* ptr = array + used;
2199 uchar16_t* end = array + used + delta;
2201 // Shift characters to make room for the string.
2202 while (ptr != start)
2209 // Add the new string now.
2210 for (u32 i = 0; i < replace_size; ++i)
2211 array[pos + i] = replace[i];
2213 pos += replace_size;
2217 // Terminate the string and return ourself.
2223 //! Removes characters from a ustring16.
2224 //! \param c The character to remove.
2225 //! \return A reference to our current string.
2226 ustring16<TAlloc>& remove(uchar32_t c)
2230 u32 len = (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character.
2231 for (u32 i=0; i<=used; ++i)
2234 if (!UTF16_IS_SURROGATE_HI(array[i]))
2236 else if (i + 1 <= used)
2238 // Convert the surrogate pair into a single UTF-32 character.
2239 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2241 u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2249 array[pos++] = array[i];
2251 array[pos++] = array[++i];
2259 //! Removes a ustring16 from the ustring16.
2260 //! \param toRemove The string to remove.
2261 //! \return A reference to our current string.
2262 ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
2264 u32 size = toRemove.size_raw();
2265 if (size == 0) return *this;
2267 const uchar16_t* tra = toRemove.c_str();
2270 for (u32 i=0; i<=used; ++i)
2275 if (array[i + j] != tra[j])
2286 array[pos++] = array[i];
2294 //! Removes characters from the ustring16.
2295 //! \param characters The characters to remove.
2296 //! \return A reference to our current string.
2297 ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
2299 if (characters.size_raw() == 0)
2304 const_iterator iter(characters);
2305 for (u32 i=0; i<=used; ++i)
2308 if (!UTF16_IS_SURROGATE_HI(array[i]))
2310 else if (i + 1 <= used)
2312 // Convert the surrogate pair into a single UTF-32 character.
2313 uc32 = unicode::toUTF32(array[i], array[i+1]);
2315 u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2319 while (!iter.atEnd())
2321 uchar32_t c = *iter;
2324 found += (c > 0xFFFF ? 2 : 1); // Remove characters equal to the size of c as a UTF-16 character.
2333 array[pos++] = array[i];
2335 array[pos++] = array[++i];
2343 //! Trims the ustring16.
2344 //! Removes the specified characters (by default, Latin-1 whitespace) from the beginning and the end of the ustring16.
2345 //! \param whitespace The characters that are to be considered as whitespace.
2346 //! \return A reference to our current string.
2347 ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
2349 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2351 // find start and end of the substring without the specified characters
2352 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2356 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2358 return (*this = subString(begin, (end +1) - begin));
2362 //! Erases a character from the ustring16.
2363 //! May be slow, because all elements following after the erased element have to be copied.
2364 //! \param index Index of element to be erased.
2365 //! \return A reference to our current string.
2366 ustring16<TAlloc>& erase(u32 index)
2368 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2370 iterator i(*this, index);
2373 u32 len = (t > 0xFFFF ? 2 : 1);
2375 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2376 array[j - len] = array[j];
2385 //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2386 //! \return A reference to our current string.
2387 ustring16<TAlloc>& validate()
2389 // Validate all unicode characters.
2390 for (u32 i=0; i<allocated; ++i)
2392 // Terminate on existing null.
2398 if (UTF16_IS_SURROGATE(array[i]))
2400 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2401 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2402 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2403 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2406 if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2407 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2414 used = allocated - 1;
2421 //! Gets the last char of the ustring16, or 0.
2422 //! \return The last char of the ustring16, or 0.
2423 uchar32_t lastChar() const
2428 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2430 // Make sure we have a paired surrogate.
2434 // Check for an invalid surrogate.
2435 if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2438 // Convert the surrogate pair into a single UTF-32 character.
2439 return unicode::toUTF32(array[used-2], array[used-1]);
2443 return array[used-1];
2448 //! Split the ustring16 into parts.
2449 /** This method will split a ustring16 at certain delimiter characters
2450 into the container passed in as reference. The type of the container
2451 has to be given as template parameter. It must provide a push_back and
2453 \param ret The result container
2454 \param c Array of UTF-32 (uchar32_t) delimiter characters
2455 \param count Number of delimiter characters
2456 \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2457 container. If two delimiters occur without a character in between, an
2458 empty substring would be placed in the result. If this flag is set,
2459 only non-empty strings are stored.
2460 \param keepSeparators Flag which allows to add the separator to the
2461 result ustring16. If this flag is true, the concatenation of the
2462 substrings results in the original ustring16. Otherwise, only the
2463 characters between the delimiters are returned.
2464 \return The number of resulting substrings
2466 template<class container>
2467 u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2472 const_iterator i(*this);
2473 const u32 oldSize=ret.size();
2477 bool lastWasSeparator = false;
2481 bool foundSeparator = false;
2482 for (u32 j=0; j<count; ++j)
2486 if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2488 ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
2489 foundSeparator = true;
2490 lastpos = (keepSeparators ? pos : pos + 1);
2491 lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2495 lastWasSeparator = foundSeparator;
2501 ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
2502 return ret.size()-oldSize;
2506 //! Split the ustring16 into parts.
2507 /** This method will split a ustring16 at certain delimiter characters
2508 into the container passed in as reference. The type of the container
2509 has to be given as template parameter. It must provide a push_back and
2511 \param ret The result container
2512 \param c A unicode string of delimiter characters
2513 \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2514 container. If two delimiters occur without a character in between, an
2515 empty substring would be placed in the result. If this flag is set,
2516 only non-empty strings are stored.
2517 \param keepSeparators Flag which allows to add the separator to the
2518 result ustring16. If this flag is true, the concatenation of the
2519 substrings results in the original ustring16. Otherwise, only the
2520 characters between the delimiters are returned.
2521 \return The number of resulting substrings
2523 template<class container>
2524 u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2526 core::array<uchar32_t> v = c.toUTF32();
2527 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2531 //! Gets the size of the allocated memory buffer for the string.
2532 //! \return The size of the allocated memory buffer.
2533 u32 capacity() const
2539 //! Returns the raw number of UTF-16 code units in the string, which includes the individual surrogates.
2540 //! \return The raw number of UTF-16 code units, excluding the trailing NUL.
2541 u32 size_raw() const
2547 //! Inserts a character into the string.
2548 //! \param c The character to insert.
2549 //! \param pos The position to insert the character.
2550 //! \return A reference to our current string.
2551 ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
2553 u8 len = (c > 0xFFFF ? 2 : 1);
2555 if (used + len >= allocated)
2556 reallocate(used + len);
2560 iterator iter(*this, pos);
2561 for (u32 i = used - 2; i > iter.getPos(); --i)
2562 array[i] = array[i - len];
2566 // c will be multibyte, so split it up into a surrogate pair.
2567 uchar16_t x = static_cast<uchar16_t>(c);
2568 uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2569 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2570 array[iter.getPos()] = vh;
2571 array[iter.getPos()+1] = vl;
2575 array[iter.getPos()] = static_cast<uchar16_t>(c);
2582 //! Inserts a string into the string.
2583 //! \param c The string to insert.
2584 //! \param pos The position to insert the string.
2585 //! \return A reference to our current string.
2586 ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
2588 u32 len = c.size_raw();
2589 if (len == 0) return *this;
2591 if (used + len >= allocated)
2592 reallocate(used + len);
2596 iterator iter(*this, pos);
2597 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2598 array[i] = array[i - len];
2600 const uchar16_t* s = c.c_str();
2601 for (u32 i = 0; i < len; ++i)
2612 //! Inserts a single raw UTF-16 code unit into the string.
2613 //! \param c The UTF-16 code unit to insert.
2614 //! \param pos The raw index in the UTF-16 array at which to insert the code unit.
2615 //! \return A reference to our current string.
2616 ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
2618 if (used + 1 >= allocated)
2619 reallocate(used + 1);
2623 for (u32 i = used - 1; i > pos; --i)
2624 array[i] = array[i - 1];
2632 //! Removes a single raw UTF-16 code unit from the string.
2633 //! \param pos Raw index in the UTF-16 array of the code unit to remove.
2634 //! \return A reference to our current string.
2635 ustring16<TAlloc>& erase_raw(u32 pos)
2637 for (u32 i=pos; i<=used; ++i)
2639 array[i] = array[i + 1];
2647 //! Replaces a raw UTF-16 code unit in the string.
2648 //! \param c The new UTF-16 code unit.
2649 //! \param pos The position of the character to replace.
2650 //! \return A reference to our current string.
2651 ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
2658 //! Returns an iterator to the beginning of the string.
2659 //! \return An iterator to the beginning of the string.
2662 iterator i(*this, 0);
2667 //! Returns an iterator to the beginning of the string.
2668 //! \return An iterator to the beginning of the string.
2669 const_iterator begin() const
2671 const_iterator i(*this, 0);
2676 //! Returns an iterator to the beginning of the string.
2677 //! \return An iterator to the beginning of the string.
2678 const_iterator cbegin() const
2680 const_iterator i(*this, 0);
2685 //! Returns an iterator to the end of the string.
2686 //! \return An iterator to the end of the string.
2689 iterator i(*this, 0);
2695 //! Returns an iterator to the end of the string.
2696 //! \return An iterator to the end of the string.
2697 const_iterator end() const
2699 const_iterator i(*this, 0);
2705 //! Returns an iterator to the end of the string.
2706 //! \return An iterator to the end of the string.
2707 const_iterator cend() const
2709 const_iterator i(*this, 0);
2715 //! Converts the string to a UTF-8 encoded string.
2716 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2717 //! \return A string containing the UTF-8 encoded string.
2718 core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2720 core::string<uchar8_t> ret;
2721 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2722 const_iterator iter(*this, 0);
2724 // Add the byte order mark if the user wants it.
2727 ret.append(unicode::BOM_ENCODE_UTF8[0]);
2728 ret.append(unicode::BOM_ENCODE_UTF8[1]);
2729 ret.append(unicode::BOM_ENCODE_UTF8[2]);
2732 while (!iter.atEnd())
2734 uchar32_t c = *iter;
2737 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2738 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2739 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2740 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2748 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2749 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2750 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2757 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2758 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2764 ret.append(static_cast<uchar8_t>(c));
2772 //! Converts the string to a UTF-8 encoded string array.
2773 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2774 //! \return An array containing the UTF-8 encoded string.
2775 core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2777 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2778 const_iterator iter(*this, 0);
2780 // Add the byte order mark if the user wants it.
2783 ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2784 ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2785 ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2788 while (!iter.atEnd())
2790 uchar32_t c = *iter;
2793 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2794 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2795 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2796 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2804 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2805 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2806 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2813 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2814 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2820 ret.push_back(static_cast<uchar8_t>(c));
2829 //! Converts the string to a UTF-16 encoded string array.
2830 //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2831 //! \param endian The desired endianness of the string.
2832 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2833 //! \return An array containing the UTF-16 encoded string.
2834 core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2836 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2837 uchar16_t* ptr = ret.pointer();
2839 // Add the BOM if specified.
2842 if (endian == unicode::EUTFEE_NATIVE)
2843 *ptr = unicode::BOM;
2844 else if (endian == unicode::EUTFEE_LITTLE)
2846 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2847 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2848 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2852 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2853 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2854 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2859 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2860 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2862 for (u32 i = 0; i <= used; ++i)
2863 ptr[i] = unicode::swapEndian16(ptr[i]);
2865 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2871 //! Converts the string to a UTF-32 encoded string array.
2872 //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
2873 //! \param endian The desired endianness of the string.
2874 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2875 //! \return An array containing the UTF-32 encoded string.
2876 core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2878 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
2879 const_iterator iter(*this, 0);
2881 // Add the BOM if specified.
2884 if (endian == unicode::EUTFEE_NATIVE)
2885 ret.push_back(unicode::BOM);
2894 if (endian == unicode::EUTFEE_LITTLE)
2896 t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
2897 t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
2898 t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
2899 t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
2903 t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
2904 t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
2905 t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
2906 t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
2908 ret.push_back(t.full);
2913 while (!iter.atEnd())
2915 uchar32_t c = *iter;
2916 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2917 c = unicode::swapEndian32(c);
2925 //! Converts the string to a wchar_t encoded string.
2926 /** The size of a wchar_t changes depending on the platform. This function will store a
2927 correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
2928 //! \param endian The desired endianness of the string.
2929 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2930 //! \return A string containing the wchar_t encoded string.
2931 core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2933 if (sizeof(wchar_t) == 4)
2935 core::array<uchar32_t> a(toUTF32(endian, addBOM));
2936 core::stringw ret(a.pointer());
2939 else if (sizeof(wchar_t) == 2)
2941 if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
2943 core::stringw ret(array);
2948 core::array<uchar16_t> a(toUTF16(endian, addBOM));
2949 core::stringw ret(a.pointer());
2953 else if (sizeof(wchar_t) == 1)
2955 core::array<uchar8_t> a(toUTF8(addBOM));
2956 core::stringw ret(a.pointer());
2960 // Shouldn't happen.
2961 return core::stringw();
2965 //! Converts the string to a wchar_t encoded string array.
2966 /** The size of a wchar_t changes depending on the platform. This function will store a
2967 correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
2968 //! \param endian The desired endianness of the string.
2969 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2970 //! \return An array containing the wchar_t encoded string.
2971 core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2973 if (sizeof(wchar_t) == 4)
2975 core::array<uchar32_t> a(toUTF32(endian, addBOM));
2976 core::array<wchar_t> ret(a.size());
2977 ret.set_used(a.size());
2978 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
2981 if (sizeof(wchar_t) == 2)
2983 if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
2985 core::array<wchar_t> ret(used);
2987 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
2992 core::array<uchar16_t> a(toUTF16(endian, addBOM));
2993 core::array<wchar_t> ret(a.size());
2994 ret.set_used(a.size());
2995 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
2999 if (sizeof(wchar_t) == 1)
3001 core::array<uchar8_t> a(toUTF8(addBOM));
3002 core::array<wchar_t> ret(a.size());
3003 ret.set_used(a.size());
3004 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
3008 // Shouldn't happen.
3009 return core::array<wchar_t>();
3012 //! Converts the string to a properly encoded io::path string.
3013 //! \param endian The desired endianness of the string.
3014 //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3015 //! \return An io::path string containing the properly encoded string.
3016 io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3018 return toUTF8_s(addBOM);
3021 //! Loads an unknown stream of data.
3022 //! Will attempt to determine if the stream is unicode data. Useful for loading from files.
3023 //! \param data The data stream to load from.
3024 //! \param data_size The length of the data string.
3025 //! \return A reference to our current string.
3026 ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
3028 // Clear our string.
3033 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
3037 case unicode::EUTFE_UTF8:
3038 append((uchar8_t*)data, data_size);
3041 case unicode::EUTFE_UTF16:
3042 case unicode::EUTFE_UTF16_BE:
3043 case unicode::EUTFE_UTF16_LE:
3044 append((uchar16_t*)data, data_size / 2);
3047 case unicode::EUTFE_UTF32:
3048 case unicode::EUTFE_UTF32_BE:
3049 case unicode::EUTFE_UTF32_LE:
3050 append((uchar32_t*)data, data_size / 4);
3057 //! Gets the encoding of the Unicode string this class contains.
3058 //! \return An enum describing the current encoding of this string.
3059 const unicode::EUTF_ENCODE getEncoding() const
3064 //! Gets the endianness of the Unicode string this class contains.
3065 //! \return An enum describing the endianness of this string.
3066 const unicode::EUTF_ENDIAN getEndianness() const
3068 if (encoding == unicode::EUTFE_UTF16_LE ||
3069 encoding == unicode::EUTFE_UTF32_LE)
3070 return unicode::EUTFEE_LITTLE;
3071 else return unicode::EUTFEE_BIG;
3076 //! Reallocate the string, making it bigger or smaller.
3077 //! \param new_size The new size of the string.
3078 void reallocate(u32 new_size)
3080 uchar16_t* old_array = array;
3082 array = allocator.allocate(new_size + 1); //new u16[new_size];
3083 allocated = new_size + 1;
3084 if (old_array == 0) return;
3086 u32 amount = used < new_size ? used : new_size;
3087 for (u32 i=0; i<=amount; ++i)
3088 array[i] = old_array[i];
3090 if (allocated <= used)
3091 used = allocated - 1;
3095 allocator.deallocate(old_array); // delete [] old_array;
3098 //--- member variables
3101 unicode::EUTF_ENCODE encoding;
3105 //irrAllocator<uchar16_t> allocator;
3108 typedef ustring16<irrAllocator<uchar16_t> > ustring;
3111 //! Appends two ustring16s.
3112 template <typename TAlloc>
3113 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
3115 ustring16<TAlloc> ret(left);
3121 //! Appends a ustring16 and a null-terminated unicode string.
3122 template <typename TAlloc, class B>
3123 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
3125 ustring16<TAlloc> ret(left);
3131 //! Appends a ustring16 and a null-terminated unicode string.
3132 template <class B, typename TAlloc>
3133 inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
3135 ustring16<TAlloc> ret(left);
3141 //! Appends a ustring16 and an Irrlicht string.
3142 template <typename TAlloc, typename B>
3143 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B>& right)
3145 ustring16<TAlloc> ret(left);
3151 //! Appends a ustring16 and an Irrlicht string.
3152 template <typename TAlloc, typename B>
3153 inline ustring16<TAlloc> operator+(const string<B>& left, const ustring16<TAlloc>& right)
3155 ustring16<TAlloc> ret(left);
3161 //! Appends a ustring16 and a std::basic_string.
3162 template <typename TAlloc, typename B, typename A, typename BAlloc>
3163 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
3165 ustring16<TAlloc> ret(left);
3171 //! Appends a ustring16 and a std::basic_string.
3172 template <typename TAlloc, typename B, typename A, typename BAlloc>
3173 inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
3175 ustring16<TAlloc> ret(left);
3181 //! Appends a ustring16 and a char.
3182 template <typename TAlloc>
3183 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
3185 ustring16<TAlloc> ret(left);
3191 //! Appends a ustring16 and a char.
3192 template <typename TAlloc>
3193 inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
3195 ustring16<TAlloc> ret(left);
3201 //! Appends a ustring16 and a uchar32_t.
3202 template <typename TAlloc>
3203 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
3205 ustring16<TAlloc> ret(left);
3211 //! Appends a ustring16 and a uchar32_t.
3212 template <typename TAlloc>
3213 inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
3215 ustring16<TAlloc> ret(left);
3221 //! Appends a ustring16 and a short.
3222 template <typename TAlloc>
3223 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
3225 ustring16<TAlloc> ret(left);
3226 ret += core::stringc(right);
3231 //! Appends a ustring16 and a short.
3232 template <typename TAlloc>
3233 inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
3235 ustring16<TAlloc> ret((core::stringc(left)));
3241 //! Appends a ustring16 and an unsigned short.
3242 template <typename TAlloc>
3243 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
3245 ustring16<TAlloc> ret(left);
3246 ret += core::stringc(right);
3251 //! Appends a ustring16 and an unsigned short.
3252 template <typename TAlloc>
3253 inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
3255 ustring16<TAlloc> ret((core::stringc(left)));
3261 //! Appends a ustring16 and an int.
3262 template <typename TAlloc>
3263 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
3265 ustring16<TAlloc> ret(left);
3266 ret += core::stringc(right);
3271 //! Appends a ustring16 and an int.
3272 template <typename TAlloc>
3273 inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
3275 ustring16<TAlloc> ret((core::stringc(left)));
3281 //! Appends a ustring16 and an unsigned int.
3282 template <typename TAlloc>
3283 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
3285 ustring16<TAlloc> ret(left);
3286 ret += core::stringc(right);
3291 //! Appends a ustring16 and an unsigned int.
3292 template <typename TAlloc>
3293 inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
3295 ustring16<TAlloc> ret((core::stringc(left)));
3301 //! Appends a ustring16 and a long.
3302 template <typename TAlloc>
3303 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
3305 ustring16<TAlloc> ret(left);
3306 ret += core::stringc(right);
3311 //! Appends a ustring16 and a long.
3312 template <typename TAlloc>
3313 inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
3315 ustring16<TAlloc> ret((core::stringc(left)));
3321 //! Appends a ustring16 and an unsigned long.
3322 template <typename TAlloc>
3323 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
3325 ustring16<TAlloc> ret(left);
3326 ret += core::stringc(right);
3331 //! Appends a ustring16 and an unsigned long.
3332 template <typename TAlloc>
3333 inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
3335 ustring16<TAlloc> ret((core::stringc(left)));
3341 //! Appends a ustring16 and a float.
3342 template <typename TAlloc>
3343 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
3345 ustring16<TAlloc> ret(left);
3346 ret += core::stringc(right);
3351 //! Appends a ustring16 and a float.
3352 template <typename TAlloc>
3353 inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
3355 ustring16<TAlloc> ret((core::stringc(left)));
3361 //! Appends a ustring16 and a double.
3362 template <typename TAlloc>
3363 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
3365 ustring16<TAlloc> ret(left);
3366 ret += core::stringc(right);
3371 //! Appends a ustring16 and a double.
3372 template <typename TAlloc>
3373 inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
3375 ustring16<TAlloc> ret((core::stringc(left)));
3381 //! Appends two ustring16s.
3382 template <typename TAlloc>
3383 inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
3385 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3386 right.insert(left, 0);
3387 return std::move(right);
3391 //! Appends two ustring16s.
3392 template <typename TAlloc>
3393 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
3395 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3397 return std::move(left);
3401 //! Appends two ustring16s.
3402 template <typename TAlloc>
3403 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
3405 //std::cout << "MOVE operator+(&&, &&)" << std::endl;
3406 if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3407 (right.capacity() - right.size_raw() < left.size_raw()))
3410 return std::move(left);
3414 right.insert(left, 0);
3415 return std::move(right);
3420 //! Appends a ustring16 and a null-terminated unicode string.
3421 template <typename TAlloc, class B>
3422 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
3424 //std::cout << "MOVE operator+(&&, B*)" << std::endl;
3426 return std::move(left);
3430 //! Appends a ustring16 and a null-terminated unicode string.
3431 template <class B, typename TAlloc>
3432 inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
3434 //std::cout << "MOVE operator+(B*, &&)" << std::endl;
3435 right.insert(left, 0);
3436 return std::move(right);
3440 //! Appends a ustring16 and an Irrlicht string.
3441 template <typename TAlloc, typename B>
3442 inline ustring16<TAlloc>&& operator+(const string<B>& left, ustring16<TAlloc>&& right)
3444 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3445 right.insert(left, 0);
3446 return std::move(right);
3450 //! Appends a ustring16 and an Irrlicht string.
3451 template <typename TAlloc, typename B>
3452 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B>& right)
3454 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3456 return std::move(left);
3460 //! Appends a ustring16 and a std::basic_string.
3461 template <typename TAlloc, typename B, typename A, typename BAlloc>
3462 inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
3464 //std::cout << "MOVE operator+(&, &&)" << std::endl;
3465 right.insert(core::ustring16<TAlloc>(left), 0);
3466 return std::move(right);
3470 //! Appends a ustring16 and a std::basic_string.
3471 template <typename TAlloc, typename B, typename A, typename BAlloc>
3472 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
3474 //std::cout << "MOVE operator+(&&, &)" << std::endl;
3476 return std::move(left);
3480 //! Appends a ustring16 and a char.
3481 template <typename TAlloc>
3482 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
3484 left.append((uchar32_t)right);
3485 return std::move(left);
3489 //! Appends a ustring16 and a char.
3490 template <typename TAlloc>
3491 inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
3493 right.insert((uchar32_t)left, 0);
3494 return std::move(right);
3498 //! Appends a ustring16 and a uchar32_t.
3499 template <typename TAlloc>
3500 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
3503 return std::move(left);
3507 //! Appends a ustring16 and a uchar32_t.
3508 template <typename TAlloc>
3509 inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
3511 right.insert(left, 0);
3512 return std::move(right);
3516 //! Appends a ustring16 and a short.
3517 template <typename TAlloc>
3518 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
3520 left.append(core::stringc(right));
3521 return std::move(left);
3525 //! Appends a ustring16 and a short.
3526 template <typename TAlloc>
3527 inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
3529 right.insert(core::stringc(left), 0);
3530 return std::move(right);
3534 //! Appends a ustring16 and an unsigned short.
3535 template <typename TAlloc>
3536 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
3538 left.append(core::stringc(right));
3539 return std::move(left);
3543 //! Appends a ustring16 and an unsigned short.
3544 template <typename TAlloc>
3545 inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
3547 right.insert(core::stringc(left), 0);
3548 return std::move(right);
3552 //! Appends a ustring16 and an int.
3553 template <typename TAlloc>
3554 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
3556 left.append(core::stringc(right));
3557 return std::move(left);
3561 //! Appends a ustring16 and an int.
3562 template <typename TAlloc>
3563 inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
3565 right.insert(core::stringc(left), 0);
3566 return std::move(right);
3570 //! Appends a ustring16 and an unsigned int.
3571 template <typename TAlloc>
3572 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
3574 left.append(core::stringc(right));
3575 return std::move(left);
3579 //! Appends a ustring16 and an unsigned int.
3580 template <typename TAlloc>
3581 inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
3583 right.insert(core::stringc(left), 0);
3584 return std::move(right);
3588 //! Appends a ustring16 and a long.
3589 template <typename TAlloc>
3590 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
3592 left.append(core::stringc(right));
3593 return std::move(left);
3597 //! Appends a ustring16 and a long.
3598 template <typename TAlloc>
3599 inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
3601 right.insert(core::stringc(left), 0);
3602 return std::move(right);
3606 //! Appends a ustring16 and an unsigned long.
3607 template <typename TAlloc>
3608 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
3610 left.append(core::stringc(right));
3611 return std::move(left);
3615 //! Appends a ustring16 and an unsigned long.
3616 template <typename TAlloc>
3617 inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
3619 right.insert(core::stringc(left), 0);
3620 return std::move(right);
3624 //! Appends a ustring16 and a float.
3625 template <typename TAlloc>
3626 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
3628 left.append(core::stringc(right));
3629 return std::move(left);
3633 //! Appends a ustring16 and a float.
3634 template <typename TAlloc>
3635 inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
3637 right.insert(core::stringc(left), 0);
3638 return std::move(right);
3642 //! Appends a ustring16 and a double.
3643 template <typename TAlloc>
3644 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
3646 left.append(core::stringc(right));
3647 return std::move(left);
3651 //! Appends a ustring16 and a double.
3652 template <typename TAlloc>
3653 inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
3655 right.insert(core::stringc(left), 0);
3656 return std::move(right);
3660 //! Writes a ustring16 to an ostream.
3661 template <typename TAlloc>
3662 inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
3664 out << in.toUTF8_s().c_str();
3668 //! Writes a ustring16 to a wostream.
3669 template <typename TAlloc>
3670 inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
3672 out << in.toWCHAR_s().c_str();
3679 //! Hashing algorithm for hashing a ustring. Used for things like unordered_maps.
3680 //! Algorithm taken from std::hash<std::string>.
3681 class hash : public std::unary_function<core::ustring, size_t>
3684 size_t operator()(const core::ustring& s) const
3686 size_t ret = 2166136261U;
3688 size_t stride = 1 + s.size_raw() / 10;
3690 core::ustring::const_iterator i = s.begin();
3691 while (i != s.end())
3693 // TODO: Don't force u32 on an x64 OS. Make it agnostic.
3694 ret = 16777619U * ret ^ (size_t)s[(u32)index];
3702 } // end namespace unicode
3704 } // end namespace core
3705 } // end namespace irr