]> git.lizzy.rs Git - irrlicht.git/blob - include/irrUString.h
Merge branch 'master' into opengl3
[irrlicht.git] / include / irrUString.h
1 /*
2    Basic Unicode string class for Irrlicht.
3    Copyright (c) 2009-2011 John Norman
4
5    This software is provided 'as-is', without any express or implied
6    warranty. In no event will the authors be held liable for any
7    damages arising from the use of this software.
8
9    Permission is granted to anyone to use this software for any
10    purpose, including commercial applications, and to alter it and
11    redistribute it freely, subject to the following restrictions:
12
13    1. The origin of this software must not be misrepresented; you
14       must not claim that you wrote the original software. If you use
15       this software in a product, an acknowledgment in the product
16       documentation would be appreciated but is not required.
17
18    2. Altered source versions must be plainly marked as such, and
19       must not be misrepresented as being the original software.
20
21    3. This notice may not be removed or altered from any source
22       distribution.
23
24    The original version of this class can be located at:
25    http://irrlicht.suckerfreegames.com/
26
27    John Norman
28    john@suckerfreegames.com
29 */
30
31 #pragma once
32
33 #include <stdio.h>
34 #include <string.h>
35 #include <stdlib.h>
36 #include <cstddef>
37
38 #ifdef _WIN32
39 #define __BYTE_ORDER 0
40 #define __LITTLE_ENDIAN 0
41 #define __BIG_ENDIAN 1
42 #elif defined(__MACH__) && defined(__APPLE__)
43 #include <machine/endian.h>
44 #elif defined(__FreeBSD__) || defined(__DragonFly__)
45 #include <sys/endian.h>
46 #else
47 #include <endian.h>
48 #endif
49
50 #include <utility>
51
52 #include <string>
53 #include <iterator>
54 #include <ostream>
55
56 #include "irrTypes.h"
57 #include "irrArray.h"
58 #include "irrMath.h"
59 #include "irrString.h"
60 #include "path.h"
61
62 //! UTF-16 surrogate start values.
63 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
64 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
65
//! Surrogate tests for a single UTF-16 code unit.
//! Each macro masks off the payload bits of (c) and compares what is left
//! against the fixed prefix of the corresponding surrogate range.
//! Any surrogate: 0xD800..0xDFFF.
#define UTF16_IS_SURROGATE(c)           (0xD800 == ((c) & 0xF800))
//! High (leading) surrogate: 0xD800..0xDBFF.
#define UTF16_IS_SURROGATE_HI(c)        (0xD800 == ((c) & 0xFC00))
//! Low (trailing) surrogate: 0xDC00..0xDFFF.
#define UTF16_IS_SURROGATE_LO(c)        (0xDC00 == ((c) & 0xFC00))
70
71
72 namespace irr
73 {
74
75         // Define our character types.
76         typedef char32_t uchar32_t;
77         typedef char16_t uchar16_t;
78         typedef char uchar8_t;
79
80 namespace core
81 {
82
83 namespace unicode
84 {
85
86 //! The unicode replacement character.  Used to replace invalid characters.
87 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
88
89 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
90 //! \param high The high value of the pair.
91 //! \param low The low value of the pair.
92 //! \return The UTF-32 character expressed by the surrogate pair.
93 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
94 {
95         // Convert the surrogate pair into a single UTF-32 character.
96         uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
97         uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
98         return (wu << 16) | x;
99 }
100
101 //! Swaps the endianness of a 16-bit value.
102 //! \return The new value.
103 inline uchar16_t swapEndian16(const uchar16_t& c)
104 {
105         return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
106 }
107
108 //! Swaps the endianness of a 32-bit value.
109 //! \return The new value.
110 inline uchar32_t swapEndian32(const uchar32_t& c)
111 {
112         return  ((c >> 24) & 0x000000FF) |
113                         ((c >> 8)  & 0x0000FF00) |
114                         ((c << 8)  & 0x00FF0000) |
115                         ((c << 24) & 0xFF000000);
116 }
117
118 //! The Unicode byte order mark.
119 const u16 BOM = 0xFEFF;
120
121 //! The size of the Unicode byte order mark in terms of the Unicode character size.
122 const u8 BOM_UTF8_LEN = 3;
123 const u8 BOM_UTF16_LEN = 1;
124 const u8 BOM_UTF32_LEN = 1;
125
126 //! Unicode byte order marks for file operations.
127 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
128 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
129 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
130 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
131 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
132
133 //! The size in bytes of the Unicode byte marks for file operations.
134 const u8 BOM_ENCODE_UTF8_LEN = 3;
135 const u8 BOM_ENCODE_UTF16_LEN = 2;
136 const u8 BOM_ENCODE_UTF32_LEN = 4;
137
//! Unicode encoding type.
//! The variants without an _LE/_BE suffix do not name a byte order themselves.
enum EUTF_ENCODE
{
        EUTFE_NONE      = 0,
        EUTFE_UTF8      = 1,
        EUTFE_UTF16     = 2,
        EUTFE_UTF16_LE  = 3,
        EUTFE_UTF16_BE  = 4,
        EUTFE_UTF32     = 5,
        EUTFE_UTF32_LE  = 6,
        EUTFE_UTF32_BE  = 7
};
150
//! Unicode endianness selector.
enum EUTF_ENDIAN
{
        EUTFEE_NATIVE   = 0,
        EUTFEE_LITTLE   = 1,
        EUTFEE_BIG      = 2
};
158
159 //! Returns the specified unicode byte order mark in a byte array.
160 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
161 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
162                 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
163 //! \return An array that contains a byte order mark.
164 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
165 {
166 #define COPY_ARRAY(source, size) \
167         memcpy(ret.pointer(), source, size); \
168         ret.set_used(size)
169
170         core::array<u8> ret(4);
171         switch (mode)
172         {
173                 case EUTFE_UTF8:
174                         COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
175                         break;
176                 case EUTFE_UTF16:
177                         #if __BYTE_ORDER == __BIG_ENDIAN
178                                 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
179                         #else
180                                 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
181                         #endif
182                         break;
183                 case EUTFE_UTF16_BE:
184                         COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
185                         break;
186                 case EUTFE_UTF16_LE:
187                         COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
188                         break;
189                 case EUTFE_UTF32:
190                         #if __BYTE_ORDER == __BIG_ENDIAN
191                                 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
192                         #else
193                                 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
194                         #endif
195                         break;
196                 case EUTFE_UTF32_BE:
197                         COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
198                         break;
199                 case EUTFE_UTF32_LE:
200                         COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
201                         break;
202                 case EUTFE_NONE:
203                         // TODO sapier: fixed warning only,
204                         // don't know if something needs to be done here
205                         break;
206         }
207         return ret;
208
209 #undef COPY_ARRAY
210 }
211
212 //! Detects if the given data stream starts with a unicode BOM.
213 //! \param data The data stream to check.
214 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
215 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
216 {
217         if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
218         if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
219         if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
220         if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
221         if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
222         return EUTFE_NONE;
223 }
224
225 } // end namespace unicode
226
227
228 //! UTF-16 string class.
229 class ustring16
230 {
231 public:
232
233         ///------------------///
234         /// iterator classes ///
235         ///------------------///
236
237         //! Access an element in a unicode string, allowing one to change it.
238         class _ustring16_iterator_access
239         {
240                 public:
241                         _ustring16_iterator_access(const ustring16* s, u32 p) : ref(s), pos(p) {}
242
243                         //! Allow the class to be interpreted as a single UTF-32 character.
244                         operator uchar32_t() const
245                         {
246                                 return _get();
247                         }
248
249                         //! Allow one to change the character in the unicode string.
250                         //! \param c The new character to use.
251                         //! \return Myself.
252                         _ustring16_iterator_access& operator=(const uchar32_t c)
253                         {
254                                 _set(c);
255                                 return *this;
256                         }
257
258                         //! Increments the value by 1.
259                         //! \return Myself.
260                         _ustring16_iterator_access& operator++()
261                         {
262                                 _set(_get() + 1);
263                                 return *this;
264                         }
265
266                         //! Increments the value by 1, returning the old value.
267                         //! \return A unicode character.
268                         uchar32_t operator++(int)
269                         {
270                                 uchar32_t old = _get();
271                                 _set(old + 1);
272                                 return old;
273                         }
274
275                         //! Decrements the value by 1.
276                         //! \return Myself.
277                         _ustring16_iterator_access& operator--()
278                         {
279                                 _set(_get() - 1);
280                                 return *this;
281                         }
282
283                         //! Decrements the value by 1, returning the old value.
284                         //! \return A unicode character.
285                         uchar32_t operator--(int)
286                         {
287                                 uchar32_t old = _get();
288                                 _set(old - 1);
289                                 return old;
290                         }
291
292                         //! Adds to the value by a specified amount.
293                         //! \param val The amount to add to this character.
294                         //! \return Myself.
295                         _ustring16_iterator_access& operator+=(int val)
296                         {
297                                 _set(_get() + val);
298                                 return *this;
299                         }
300
301                         //! Subtracts from the value by a specified amount.
302                         //! \param val The amount to subtract from this character.
303                         //! \return Myself.
304                         _ustring16_iterator_access& operator-=(int val)
305                         {
306                                 _set(_get() - val);
307                                 return *this;
308                         }
309
310                         //! Multiples the value by a specified amount.
311                         //! \param val The amount to multiply this character by.
312                         //! \return Myself.
313                         _ustring16_iterator_access& operator*=(int val)
314                         {
315                                 _set(_get() * val);
316                                 return *this;
317                         }
318
319                         //! Divides the value by a specified amount.
320                         //! \param val The amount to divide this character by.
321                         //! \return Myself.
322                         _ustring16_iterator_access& operator/=(int val)
323                         {
324                                 _set(_get() / val);
325                                 return *this;
326                         }
327
328                         //! Modulos the value by a specified amount.
329                         //! \param val The amount to modulo this character by.
330                         //! \return Myself.
331                         _ustring16_iterator_access& operator%=(int val)
332                         {
333                                 _set(_get() % val);
334                                 return *this;
335                         }
336
337                         //! Adds to the value by a specified amount.
338                         //! \param val The amount to add to this character.
339                         //! \return A unicode character.
340                         uchar32_t operator+(int val) const
341                         {
342                                 return _get() + val;
343                         }
344
345                         //! Subtracts from the value by a specified amount.
346                         //! \param val The amount to subtract from this character.
347                         //! \return A unicode character.
348                         uchar32_t operator-(int val) const
349                         {
350                                 return _get() - val;
351                         }
352
353                         //! Multiplies the value by a specified amount.
354                         //! \param val The amount to multiply this character by.
355                         //! \return A unicode character.
356                         uchar32_t operator*(int val) const
357                         {
358                                 return _get() * val;
359                         }
360
361                         //! Divides the value by a specified amount.
362                         //! \param val The amount to divide this character by.
363                         //! \return A unicode character.
364                         uchar32_t operator/(int val) const
365                         {
366                                 return _get() / val;
367                         }
368
369                         //! Modulos the value by a specified amount.
370                         //! \param val The amount to modulo this character by.
371                         //! \return A unicode character.
372                         uchar32_t operator%(int val) const
373                         {
374                                 return _get() % val;
375                         }
376
377                 private:
378                         //! Gets a uchar32_t from our current position.
379                         uchar32_t _get() const
380                         {
381                                 const uchar16_t* a = ref->c_str();
382                                 if (!UTF16_IS_SURROGATE(a[pos]))
383                                         return static_cast<uchar32_t>(a[pos]);
384                                 else
385                                 {
386                                         if (pos + 1 >= ref->size_raw())
387                                                 return 0;
388
389                                         return unicode::toUTF32(a[pos], a[pos + 1]);
390                                 }
391                         }
392
393                         //! Sets a uchar32_t at our current position.
394                         void _set(uchar32_t c)
395                         {
396                                 ustring16* ref2 = const_cast<ustring16*>(ref);
397                                 const uchar16_t* a = ref2->c_str();
398                                 if (c > 0xFFFF)
399                                 {
400                                         // c will be multibyte, so split it up into the high and low surrogate pairs.
401                                         uchar16_t x = static_cast<uchar16_t>(c);
402                                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
403                                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
404
405                                         // If the previous position was a surrogate pair, just replace them.  Else, insert the low pair.
406                                         if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
407                                                 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
408                                         else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
409
410                                         ref2->replace_raw(vh, static_cast<u32>(pos));
411                                 }
412                                 else
413                                 {
414                                         // c will be a single byte.
415                                         uchar16_t vh = static_cast<uchar16_t>(c);
416
417                                         // If the previous position was a surrogate pair, remove the extra byte.
418                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
419                                                 ref2->erase_raw(static_cast<u32>(pos) + 1);
420
421                                         ref2->replace_raw(vh, static_cast<u32>(pos));
422                                 }
423                         }
424
425                         const ustring16* ref;
426                         u32 pos;
427         };
428         typedef typename ustring16::_ustring16_iterator_access access;
429
430
431         //! Iterator to iterate through a UTF-16 string.
432         class _ustring16_const_iterator : public std::iterator<
433                 std::bidirectional_iterator_tag,        // iterator_category
434                 access,                                                         // value_type
435                 ptrdiff_t,                                                      // difference_type
436                 const access,                                           // pointer
437                 const access                                            // reference
438         >
439         {
440                 public:
441                         typedef _ustring16_const_iterator _Iter;
442                         typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
443                         typedef const access const_pointer;
444                         typedef const access const_reference;
445
446                         typedef typename _Base::value_type value_type;
447                         typedef typename _Base::difference_type difference_type;
448                         typedef typename _Base::difference_type distance_type;
449                         typedef typename _Base::pointer pointer;
450                         typedef const_reference reference;
451
452                         //! Constructors.
453                         _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
454                         _ustring16_const_iterator(const ustring16& s) : ref(&s), pos(0) {}
455                         _ustring16_const_iterator(const ustring16& s, const u32 p) : ref(&s), pos(0)
456                         {
457                                 if (ref->size_raw() == 0 || p == 0)
458                                         return;
459
460                                 // Go to the appropriate position.
461                                 u32 i = p;
462                                 u32 sr = ref->size_raw();
463                                 const uchar16_t* a = ref->c_str();
464                                 while (i != 0 && pos < sr)
465                                 {
466                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
467                                                 pos += 2;
468                                         else ++pos;
469                                         --i;
470                                 }
471                         }
472
473                         //! Test for equalness.
474                         bool operator==(const _Iter& iter) const
475                         {
476                                 if (ref == iter.ref && pos == iter.pos)
477                                         return true;
478                                 return false;
479                         }
480
481                         //! Test for unequalness.
482                         bool operator!=(const _Iter& iter) const
483                         {
484                                 if (ref != iter.ref || pos != iter.pos)
485                                         return true;
486                                 return false;
487                         }
488
489                         //! Switch to the next full character in the string.
490                         _Iter& operator++()
491                         {       // ++iterator
492                                 if (pos == ref->size_raw()) return *this;
493                                 const uchar16_t* a = ref->c_str();
494                                 if (UTF16_IS_SURROGATE_HI(a[pos]))
495                                         pos += 2;                       // TODO: check for valid low surrogate?
496                                 else ++pos;
497                                 if (pos > ref->size_raw()) pos = ref->size_raw();
498                                 return *this;
499                         }
500
501                         //! Switch to the next full character in the string, returning the previous position.
502                         _Iter operator++(int)
503                         {       // iterator++
504                                 _Iter _tmp(*this);
505                                 ++*this;
506                                 return _tmp;
507                         }
508
509                         //! Switch to the previous full character in the string.
510                         _Iter& operator--()
511                         {       // --iterator
512                                 if (pos == 0) return *this;
513                                 const uchar16_t* a = ref->c_str();
514                                 --pos;
515                                 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0)  // low surrogate, go back one more.
516                                         --pos;
517                                 return *this;
518                         }
519
520                         //! Switch to the previous full character in the string, returning the previous position.
521                         _Iter operator--(int)
522                         {       // iterator--
523                                 _Iter _tmp(*this);
524                                 --*this;
525                                 return _tmp;
526                         }
527
528                         //! Advance a specified number of full characters in the string.
529                         //! \return Myself.
530                         _Iter& operator+=(const difference_type v)
531                         {
532                                 if (v == 0) return *this;
533                                 if (v < 0) return operator-=(v * -1);
534
535                                 if (pos >= ref->size_raw())
536                                         return *this;
537
538                                 // Go to the appropriate position.
539                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
540                                 u32 i = (u32)v;
541                                 u32 sr = ref->size_raw();
542                                 const uchar16_t* a = ref->c_str();
543                                 while (i != 0 && pos < sr)
544                                 {
545                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
546                                                 pos += 2;
547                                         else ++pos;
548                                         --i;
549                                 }
550                                 if (pos > sr)
551                                         pos = sr;
552
553                                 return *this;
554                         }
555
556                         //! Go back a specified number of full characters in the string.
557                         //! \return Myself.
558                         _Iter& operator-=(const difference_type v)
559                         {
560                                 if (v == 0) return *this;
561                                 if (v > 0) return operator+=(v * -1);
562
563                                 if (pos == 0)
564                                         return *this;
565
566                                 // Go to the appropriate position.
567                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
568                                 u32 i = (u32)v;
569                                 const uchar16_t* a = ref->c_str();
570                                 while (i != 0 && pos != 0)
571                                 {
572                                         --pos;
573                                         if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
574                                                 --pos;
575                                         --i;
576                                 }
577
578                                 return *this;
579                         }
580
581                         //! Return a new iterator that is a variable number of full characters forward from the current position.
582                         _Iter operator+(const difference_type v) const
583                         {
584                                 _Iter ret(*this);
585                                 ret += v;
586                                 return ret;
587                         }
588
589                         //! Return a new iterator that is a variable number of full characters backward from the current position.
590                         _Iter operator-(const difference_type v) const
591                         {
592                                 _Iter ret(*this);
593                                 ret -= v;
594                                 return ret;
595                         }
596
597                         //! Returns the distance between two iterators.
598                         difference_type operator-(const _Iter& iter) const
599                         {
600                                 // Make sure we reference the same object!
601                                 if (ref != iter.ref)
602                                         return difference_type();
603
604                                 _Iter i = iter;
605                                 difference_type ret;
606
607                                 // Walk up.
608                                 if (pos > i.pos)
609                                 {
610                                         while (pos > i.pos)
611                                         {
612                                                 ++i;
613                                                 ++ret;
614                                         }
615                                         return ret;
616                                 }
617
618                                 // Walk down.
619                                 while (pos < i.pos)
620                                 {
621                                         --i;
622                                         --ret;
623                                 }
624                                 return ret;
625                         }
626
627                         //! Accesses the full character at the iterator's position.
628                         const_reference operator*() const
629                         {
630                                 if (pos >= ref->size_raw())
631                                 {
632                                         const uchar16_t* a = ref->c_str();
633                                         u32 p = ref->size_raw();
634                                         if (UTF16_IS_SURROGATE_LO(a[p]))
635                                                 --p;
636                                         reference ret(ref, p);
637                                         return ret;
638                                 }
639                                 const_reference ret(ref, pos);
640                                 return ret;
641                         }
642
643                         //! Accesses the full character at the iterator's position.
644                         reference operator*()
645                         {
646                                 if (pos >= ref->size_raw())
647                                 {
648                                         const uchar16_t* a = ref->c_str();
649                                         u32 p = ref->size_raw();
650                                         if (UTF16_IS_SURROGATE_LO(a[p]))
651                                                 --p;
652                                         reference ret(ref, p);
653                                         return ret;
654                                 }
655                                 reference ret(ref, pos);
656                                 return ret;
657                         }
658
659                         //! Accesses the full character at the iterator's position.
660                         const_pointer operator->() const
661                         {
662                                 return operator*();
663                         }
664
665                         //! Accesses the full character at the iterator's position.
666                         pointer operator->()
667                         {
668                                 return operator*();
669                         }
670
671                         //! Is the iterator at the start of the string?
672                         bool atStart() const
673                         {
674                                 return pos == 0;
675                         }
676
677                         //! Is the iterator at the end of the string?
678                         bool atEnd() const
679                         {
680                                 const uchar16_t* a = ref->c_str();
681                                 if (UTF16_IS_SURROGATE(a[pos]))
682                                         return (pos + 1) >= ref->size_raw();
683                                 else return pos >= ref->size_raw();
684                         }
685
686                         //! Moves the iterator to the start of the string.
687                         void toStart()
688                         {
689                                 pos = 0;
690                         }
691
692                         //! Moves the iterator to the end of the string.
693                         void toEnd()
694                         {
695                                 pos = ref->size_raw();
696                         }
697
698                         //! Returns the iterator's position.
699                         //! \return The iterator's position.
700                         u32 getPos() const
701                         {
702                                 return pos;
703                         }
704
705                 protected:
706                         const ustring16* ref;
707                         u32 pos;
708         };
709
710         //! Iterator to iterate through a UTF-16 string.
711         class _ustring16_iterator : public _ustring16_const_iterator
712         {
713                 public:
714                         typedef _ustring16_iterator _Iter;
715                         typedef _ustring16_const_iterator _Base;
716                         typedef typename _Base::const_pointer const_pointer;
717                         typedef typename _Base::const_reference const_reference;
718
719
720                         typedef typename _Base::value_type value_type;
721                         typedef typename _Base::difference_type difference_type;
722                         typedef typename _Base::distance_type distance_type;
723                         typedef access pointer;
724                         typedef access reference;
725
726                         using _Base::pos;
727                         using _Base::ref;
728
729                         //! Constructors.
730                         _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
731                         _ustring16_iterator(const ustring16& s) : _ustring16_const_iterator(s) {}
732                         _ustring16_iterator(const ustring16& s, const u32 p) : _ustring16_const_iterator(s, p) {}
733
734                         //! Accesses the full character at the iterator's position.
735                         reference operator*() const
736                         {
737                                 if (pos >= ref->size_raw())
738                                 {
739                                         const uchar16_t* a = ref->c_str();
740                                         u32 p = ref->size_raw();
741                                         if (UTF16_IS_SURROGATE_LO(a[p]))
742                                                 --p;
743                                         reference ret(ref, p);
744                                         return ret;
745                                 }
746                                 reference ret(ref, pos);
747                                 return ret;
748                         }
749
750                         //! Accesses the full character at the iterator's position.
751                         reference operator*()
752                         {
753                                 if (pos >= ref->size_raw())
754                                 {
755                                         const uchar16_t* a = ref->c_str();
756                                         u32 p = ref->size_raw();
757                                         if (UTF16_IS_SURROGATE_LO(a[p]))
758                                                 --p;
759                                         reference ret(ref, p);
760                                         return ret;
761                                 }
762                                 reference ret(ref, pos);
763                                 return ret;
764                         }
765
766                         //! Accesses the full character at the iterator's position.
767                         pointer operator->() const
768                         {
769                                 return operator*();
770                         }
771
772                         //! Accesses the full character at the iterator's position.
773                         pointer operator->()
774                         {
775                                 return operator*();
776                         }
777         };
778
779         typedef typename ustring16::_ustring16_iterator iterator;
780         typedef typename ustring16::_ustring16_const_iterator const_iterator;
781
782         ///----------------------///
783         /// end iterator classes ///
784         ///----------------------///
785
786         //! Default constructor
787         ustring16()
788         : array(0), allocated(1), used(0)
789         {
790 #if __BYTE_ORDER == __BIG_ENDIAN
791                 encoding = unicode::EUTFE_UTF16_BE;
792 #else
793                 encoding = unicode::EUTFE_UTF16_LE;
794 #endif
795                 array = new uchar16_t[1];
796                 array[0] = 0x0;
797         }
798
799
800         //! Constructor
801         ustring16(const ustring16& other)
802         : array(0), allocated(0), used(0)
803         {
804 #if __BYTE_ORDER == __BIG_ENDIAN
805                 encoding = unicode::EUTFE_UTF16_BE;
806 #else
807                 encoding = unicode::EUTFE_UTF16_LE;
808 #endif
809                 *this = other;
810         }
811
812
813         //! Constructor from other string types
814         template <class B>
815         ustring16(const string<B>& other)
816         : array(0), allocated(0), used(0)
817         {
818 #if __BYTE_ORDER == __BIG_ENDIAN
819                 encoding = unicode::EUTFE_UTF16_BE;
820 #else
821                 encoding = unicode::EUTFE_UTF16_LE;
822 #endif
823                 *this = other;
824         }
825
826         //! Constructor from std::string
827         template <class B, class A, typename Alloc>
828         ustring16(const std::basic_string<B, A, Alloc>& other)
829         : array(0), allocated(0), used(0)
830         {
831 #if __BYTE_ORDER == __BIG_ENDIAN
832                 encoding = unicode::EUTFE_UTF16_BE;
833 #else
834                 encoding = unicode::EUTFE_UTF16_LE;
835 #endif
836                 *this = other.c_str();
837         }
838
839
840         //! Constructor from iterator.
841         template <typename Itr>
842         ustring16(Itr first, Itr last)
843         : array(0), allocated(0), used(0)
844         {
845 #if __BYTE_ORDER == __BIG_ENDIAN
846                 encoding = unicode::EUTFE_UTF16_BE;
847 #else
848                 encoding = unicode::EUTFE_UTF16_LE;
849 #endif
850                 reserve(std::distance(first, last));
851                 array[used] = 0;
852
853                 for (; first != last; ++first)
854                         append((uchar32_t)*first);
855         }
856
857         //! Constructor for copying a UTF-8 string from a pointer.
858         ustring16(const uchar8_t* const c)
859         : array(0), allocated(0), used(0)
860         {
861 #if __BYTE_ORDER == __BIG_ENDIAN
862                 encoding = unicode::EUTFE_UTF16_BE;
863 #else
864                 encoding = unicode::EUTFE_UTF16_LE;
865 #endif
866
867                 append(c);
868         }
869
870
871         //! Constructor for copying a UTF-8 string from a single char.
872         ustring16(const char c)
873         : array(0), allocated(0), used(0)
874         {
875 #if __BYTE_ORDER == __BIG_ENDIAN
876                 encoding = unicode::EUTFE_UTF16_BE;
877 #else
878                 encoding = unicode::EUTFE_UTF16_LE;
879 #endif
880
881                 append((uchar32_t)c);
882         }
883
884
885         //! Constructor for copying a UTF-8 string from a pointer with a given length.
886         ustring16(const uchar8_t* const c, u32 length)
887         : array(0), allocated(0), used(0)
888         {
889 #if __BYTE_ORDER == __BIG_ENDIAN
890                 encoding = unicode::EUTFE_UTF16_BE;
891 #else
892                 encoding = unicode::EUTFE_UTF16_LE;
893 #endif
894
895                 append(c, length);
896         }
897
898
899         //! Constructor for copying a UTF-16 string from a pointer.
900         ustring16(const uchar16_t* const c)
901         : array(0), allocated(0), used(0)
902         {
903 #if __BYTE_ORDER == __BIG_ENDIAN
904                 encoding = unicode::EUTFE_UTF16_BE;
905 #else
906                 encoding = unicode::EUTFE_UTF16_LE;
907 #endif
908
909                 append(c);
910         }
911
912
913         //! Constructor for copying a UTF-16 string from a pointer with a given length
914         ustring16(const uchar16_t* const c, u32 length)
915         : array(0), allocated(0), used(0)
916         {
917 #if __BYTE_ORDER == __BIG_ENDIAN
918                 encoding = unicode::EUTFE_UTF16_BE;
919 #else
920                 encoding = unicode::EUTFE_UTF16_LE;
921 #endif
922
923                 append(c, length);
924         }
925
926
927         //! Constructor for copying a UTF-32 string from a pointer.
928         ustring16(const uchar32_t* const c)
929         : array(0), allocated(0), used(0)
930         {
931 #if __BYTE_ORDER == __BIG_ENDIAN
932                 encoding = unicode::EUTFE_UTF16_BE;
933 #else
934                 encoding = unicode::EUTFE_UTF16_LE;
935 #endif
936
937                 append(c);
938         }
939
940
941         //! Constructor for copying a UTF-32 from a pointer with a given length.
942         ustring16(const uchar32_t* const c, u32 length)
943         : array(0), allocated(0), used(0)
944         {
945 #if __BYTE_ORDER == __BIG_ENDIAN
946                 encoding = unicode::EUTFE_UTF16_BE;
947 #else
948                 encoding = unicode::EUTFE_UTF16_LE;
949 #endif
950
951                 append(c, length);
952         }
953
954
955         //! Constructor for copying a wchar_t string from a pointer.
956         ustring16(const wchar_t* const c)
957         : array(0), allocated(0), used(0)
958         {
959 #if __BYTE_ORDER == __BIG_ENDIAN
960                 encoding = unicode::EUTFE_UTF16_BE;
961 #else
962                 encoding = unicode::EUTFE_UTF16_LE;
963 #endif
964
965                 if (sizeof(wchar_t) == 4)
966                         append(reinterpret_cast<const uchar32_t*>(c));
967                 else if (sizeof(wchar_t) == 2)
968                         append(reinterpret_cast<const uchar16_t*>(c));
969                 else if (sizeof(wchar_t) == 1)
970                         append(reinterpret_cast<const uchar8_t*>(c));
971         }
972
973
974         //! Constructor for copying a wchar_t string from a pointer with a given length.
975         ustring16(const wchar_t* const c, u32 length)
976         : array(0), allocated(0), used(0)
977         {
978 #if __BYTE_ORDER == __BIG_ENDIAN
979                 encoding = unicode::EUTFE_UTF16_BE;
980 #else
981                 encoding = unicode::EUTFE_UTF16_LE;
982 #endif
983
984                 if (sizeof(wchar_t) == 4)
985                         append(reinterpret_cast<const uchar32_t*>(c), length);
986                 else if (sizeof(wchar_t) == 2)
987                         append(reinterpret_cast<const uchar16_t*>(c), length);
988                 else if (sizeof(wchar_t) == 1)
989                         append(reinterpret_cast<const uchar8_t*>(c), length);
990         }
991
992
993         //! Constructor for moving a ustring16
994         ustring16(ustring16&& other)
995         : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
996         {
997                 other.array = 0;
998                 other.allocated = 0;
999                 other.used = 0;
1000         }
1001
1002         //! Destructor
1003         ~ustring16()
1004         {
1005                 delete [] array;
1006         }
1007
1008
1009         //! Assignment operator
1010         ustring16& operator=(const ustring16& other)
1011         {
1012                 if (this == &other)
1013                         return *this;
1014
1015                 used = other.size_raw();
1016                 if (used >= allocated)
1017                 {
1018                         delete [] array;
1019                         allocated = used + 1;
1020                         array = new uchar16_t[used + 1];
1021                 }
1022
1023                 const uchar16_t* p = other.c_str();
1024                 for (u32 i=0; i<=used; ++i, ++p)
1025                         array[i] = *p;
1026
1027                 array[used] = 0;
1028
1029                 // Validate our new UTF-16 string.
1030                 validate();
1031
1032                 return *this;
1033         }
1034
1035         //! Move assignment operator
1036         ustring16& operator=(ustring16&& other)
1037         {
1038                 if (this != &other)
1039                 {
1040                         delete [] array;
1041
1042                         array = other.array;
1043                         allocated = other.allocated;
1044                         encoding = other.encoding;
1045                         used = other.used;
1046                         other.array = 0;
1047                         other.used = 0;
1048                 }
1049                 return *this;
1050         }
1051
1052         //! Assignment operator for other string types
1053         template <class B>
1054         ustring16& operator=(const string<B>& other)
1055         {
1056                 *this = other.c_str();
1057                 return *this;
1058         }
1059
1060
1061         //! Assignment operator for UTF-8 strings
1062         ustring16& operator=(const uchar8_t* const c)
1063         {
1064                 if (!array)
1065                 {
1066                         array = new uchar16_t[1];
1067                         allocated = 1;
1068                 }
1069                 used = 0;
1070                 array[used] = 0x0;
1071                 if (!c) return *this;
1072
1073                 append(c);
1074                 return *this;
1075         }
1076
1077
1078         //! Assignment operator for UTF-16 strings
1079         ustring16& operator=(const uchar16_t* const c)
1080         {
1081                 if (!array)
1082                 {
1083                         array = new uchar16_t[1];
1084                         allocated = 1;
1085                 }
1086                 used = 0;
1087                 array[used] = 0x0;
1088                 if (!c) return *this;
1089
1090                 append(c);
1091                 return *this;
1092         }
1093
1094
1095         //! Assignment operator for UTF-32 strings
1096         ustring16& operator=(const uchar32_t* const c)
1097         {
1098                 if (!array)
1099                 {
1100                         array = new uchar16_t[1];
1101                         allocated = 1;
1102                 }
1103                 used = 0;
1104                 array[used] = 0x0;
1105                 if (!c) return *this;
1106
1107                 append(c);
1108                 return *this;
1109         }
1110
1111
1112         //! Assignment operator for wchar_t strings.
1113         /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1114                 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1115                 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
1116         ustring16& operator=(const wchar_t* const c)
1117         {
1118                 if (sizeof(wchar_t) == 4)
1119                         *this = reinterpret_cast<const uchar32_t*>(c);
1120                 else if (sizeof(wchar_t) == 2)
1121                         *this = reinterpret_cast<const uchar16_t*>(c);
1122                 else if (sizeof(wchar_t) == 1)
1123                         *this = reinterpret_cast<const uchar8_t*>(c);
1124
1125                 return *this;
1126         }
1127
1128
1129         //! Assignment operator for other strings.
1130         /** Note that this assumes that a correct unicode string is stored in the string. **/
1131         template <class B>
1132         ustring16& operator=(const B* const c)
1133         {
1134                 if (sizeof(B) == 4)
1135                         *this = reinterpret_cast<const uchar32_t* const>(c);
1136                 else if (sizeof(B) == 2)
1137                         *this = reinterpret_cast<const uchar16_t* const>(c);
1138                 else if (sizeof(B) == 1)
1139                         *this = reinterpret_cast<const uchar8_t* const>(c);
1140
1141                 return *this;
1142         }
1143
1144
1145         //! Direct access operator
1146         access operator [](const u32 index)
1147         {
1148                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1149                 iterator iter(*this, index);
1150                 return iter.operator*();
1151         }
1152
1153
1154         //! Direct access operator
1155         const access operator [](const u32 index) const
1156         {
1157                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1158                 const_iterator iter(*this, index);
1159                 return iter.operator*();
1160         }
1161
1162
1163         //! Equality operator
1164         bool operator ==(const uchar16_t* const str) const
1165         {
1166                 if (!str)
1167                         return false;
1168
1169                 u32 i;
1170                 for(i=0; array[i] && str[i]; ++i)
1171                         if (array[i] != str[i])
1172                                 return false;
1173
1174                 return !array[i] && !str[i];
1175         }
1176
1177
1178         //! Equality operator
1179         bool operator ==(const ustring16& other) const
1180         {
1181                 for(u32 i=0; array[i] && other.array[i]; ++i)
1182                         if (array[i] != other.array[i])
1183                                 return false;
1184
1185                 return used == other.used;
1186         }
1187
1188
1189         //! Is smaller comparator
1190         bool operator <(const ustring16& other) const
1191         {
1192                 for(u32 i=0; array[i] && other.array[i]; ++i)
1193                 {
1194                         s32 diff = array[i] - other.array[i];
1195                         if ( diff )
1196                                 return diff < 0;
1197                 }
1198
1199                 return used < other.used;
1200         }
1201
1202
1203         //! Inequality operator
1204         bool operator !=(const uchar16_t* const str) const
1205         {
1206                 return !(*this == str);
1207         }
1208
1209
1210         //! Inequality operator
1211         bool operator !=(const ustring16& other) const
1212         {
1213                 return !(*this == other);
1214         }
1215
1216
1217         //! Returns the length of a ustring16 in full characters.
1218         //! \return Length of a ustring16 in full characters.
1219         u32 size() const
1220         {
1221                 const_iterator i(*this, 0);
1222                 u32 pos = 0;
1223                 while (!i.atEnd())
1224                 {
1225                         ++i;
1226                         ++pos;
1227                 }
1228                 return pos;
1229         }
1230
1231
1232         //! Informs if the ustring is empty or not.
1233         //! \return True if the ustring is empty, false if not.
1234         bool empty() const
1235         {
1236                 return (size_raw() == 0);
1237         }
1238
1239
1240         //! Returns a pointer to the raw UTF-16 string data.
1241         //! \return pointer to C-style NUL terminated array of UTF-16 code points.
1242         const uchar16_t* c_str() const
1243         {
1244                 return array;
1245         }
1246
1247
1248         //! Compares the first n characters of this string with another.
1249         //! \param other Other string to compare to.
1250         //! \param n Number of characters to compare.
1251         //! \return True if the n first characters of both strings are equal.
1252         bool equalsn(const ustring16& other, u32 n) const
1253         {
1254                 u32 i;
1255                 const uchar16_t* oa = other.c_str();
1256                 for(i=0; i < n && array[i] && oa[i]; ++i)
1257                         if (array[i] != oa[i])
1258                                 return false;
1259
1260                 // if one (or both) of the strings was smaller then they
1261                 // are only equal if they have the same length
1262                 return (i == n) || (used == other.used);
1263         }
1264
1265
1266         //! Compares the first n characters of this string with another.
1267         //! \param str Other string to compare to.
1268         //! \param n Number of characters to compare.
1269         //! \return True if the n first characters of both strings are equal.
1270         bool equalsn(const uchar16_t* const str, u32 n) const
1271         {
1272                 if (!str)
1273                         return false;
1274                 u32 i;
1275                 for(i=0; i < n && array[i] && str[i]; ++i)
1276                         if (array[i] != str[i])
1277                                 return false;
1278
1279                 // if one (or both) of the strings was smaller then they
1280                 // are only equal if they have the same length
1281                 return (i == n) || (array[i] == 0 && str[i] == 0);
1282         }
1283
1284
1285         //! Appends a character to this ustring16
1286         //! \param character The character to append.
1287         //! \return A reference to our current string.
1288         ustring16& append(uchar32_t character)
1289         {
1290                 if (used + 2 >= allocated)
1291                         reallocate(used + 2);
1292
1293                 if (character > 0xFFFF)
1294                 {
1295                         used += 2;
1296
1297                         // character will be multibyte, so split it up into a surrogate pair.
1298                         uchar16_t x = static_cast<uchar16_t>(character);
1299                         uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1300                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1301                         array[used-2] = vh;
1302                         array[used-1] = vl;
1303                 }
1304                 else
1305                 {
1306                         ++used;
1307                         array[used-1] = character;
1308                 }
1309                 array[used] = 0;
1310
1311                 return *this;
1312         }
1313
1314
1315         //! Appends a UTF-8 string to this ustring16
1316         //! \param other The UTF-8 string to append.
1317         //! \param length The length of the string to append.
1318         //! \return A reference to our current string.
1319         ustring16& append(const uchar8_t* const other, u32 length=0xffffffff)
1320         {
1321                 if (!other)
1322                         return *this;
1323
1324                 // Determine if the string is long enough for a BOM.
1325                 u32 len = 0;
1326                 const uchar8_t* p = other;
1327                 do
1328                 {
1329                         ++len;
1330                 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1331
1332                 // Check for BOM.
1333                 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1334                 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1335                 {
1336                         if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1337                                 c_bom = unicode::EUTFE_UTF8;
1338                 }
1339
1340                 // If a BOM was found, don't include it in the string.
1341                 const uchar8_t* c2 = other;
1342                 if (c_bom != unicode::EUTFE_NONE)
1343                 {
1344                         c2 = other + unicode::BOM_UTF8_LEN;
1345                         length -= unicode::BOM_UTF8_LEN;
1346                 }
1347
1348                 // Calculate the size of the string to read in.
1349                 len = 0;
1350                 p = c2;
1351                 do
1352                 {
1353                         ++len;
1354                 } while(*p++ && len < length);
1355                 if (len > length)
1356                         len = length;
1357
1358                 // If we need to grow the array, do it now.
1359                 if (used + len >= allocated)
1360                         reallocate(used + (len * 2));
1361                 u32 start = used;
1362
1363                 // Convert UTF-8 to UTF-16.
1364                 u32 pos = start;
1365                 for (u32 l = 0; l<len;)
1366                 {
1367                         ++used;
1368                         if (((c2[l] >> 6) & 0x03) == 0x02)
1369                         {       // Invalid continuation byte.
1370                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1371                                 ++l;
1372                         }
1373                         else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1374                         {       // Invalid byte - overlong encoding.
1375                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1376                                 ++l;
1377                         }
1378                         else if ((c2[l] & 0xF8) == 0xF0)
1379                         {       // 4 bytes UTF-8, 2 bytes UTF-16.
1380                                 // Check for a full string.
1381                                 if ((l + 3) >= len)
1382                                 {
1383                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1384                                         l += 3;
1385                                         break;
1386                                 }
1387
1388                                 // Validate.
1389                                 bool valid = true;
1390                                 u8 l2 = 0;
1391                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1392                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1393                                 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1394                                 if (!valid)
1395                                 {
1396                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1397                                         l += l2;
1398                                         continue;
1399                                 }
1400
1401                                 // Decode.
1402                                 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1403                                 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1404                                 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1405                                 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1406
1407                                 // Split v up into a surrogate pair.
1408                                 uchar16_t x = static_cast<uchar16_t>(v);
1409                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1410                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1411
1412                                 array[pos++] = vh;
1413                                 array[pos++] = vl;
1414                                 l += 4;
1415                                 ++used;         // Using two shorts this time, so increase used by 1.
1416                         }
1417                         else if ((c2[l] & 0xF0) == 0xE0)
1418                         {       // 3 bytes UTF-8, 1 byte UTF-16.
1419                                 // Check for a full string.
1420                                 if ((l + 2) >= len)
1421                                 {
1422                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1423                                         l += 2;
1424                                         break;
1425                                 }
1426
1427                                 // Validate.
1428                                 bool valid = true;
1429                                 u8 l2 = 0;
1430                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1431                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1432                                 if (!valid)
1433                                 {
1434                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1435                                         l += l2;
1436                                         continue;
1437                                 }
1438
1439                                 // Decode.
1440                                 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1441                                 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1442                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1443                                 array[pos++] = ch;
1444                                 l += 3;
1445                         }
1446                         else if ((c2[l] & 0xE0) == 0xC0)
1447                         {       // 2 bytes UTF-8, 1 byte UTF-16.
1448                                 // Check for a full string.
1449                                 if ((l + 1) >= len)
1450                                 {
1451                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1452                                         l += 1;
1453                                         break;
1454                                 }
1455
1456                                 // Validate.
1457                                 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1458                                 {
1459                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1460                                         ++l;
1461                                         continue;
1462                                 }
1463
1464                                 // Decode.
1465                                 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1466                                 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1467                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1468                                 array[pos++] = ch;
1469                                 l += 2;
1470                         }
1471                         else
1472                         {       // 1 byte UTF-8, 1 byte UTF-16.
1473                                 // Validate.
1474                                 if (c2[l] > 0x7F)
1475                                 {       // Values above 0xF4 are restricted and aren't used.  By now, anything above 0x7F is invalid.
1476                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1477                                 }
1478                                 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1479                                 ++l;
1480                         }
1481                 }
1482                 array[used] = 0;
1483
1484                 // Validate our new UTF-16 string.
1485                 validate();
1486
1487                 return *this;
1488         }
1489
1490
1491         //! Appends a UTF-16 string to this ustring16
1492         //! \param other The UTF-16 string to append.
1493         //! \param length The length of the string to append.
1494         //! \return A reference to our current string.
1495         ustring16& append(const uchar16_t* const other, u32 length=0xffffffff)
1496         {
1497                 if (!other)
1498                         return *this;
1499
1500                 // Determine if the string is long enough for a BOM.
1501                 u32 len = 0;
1502                 const uchar16_t* p = other;
1503                 do
1504                 {
1505                         ++len;
1506                 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1507
1508                 // Check for the BOM to determine the string's endianness.
1509                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1510                 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1511                         c_end = unicode::EUTFEE_LITTLE;
1512                 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1513                         c_end = unicode::EUTFEE_BIG;
1514
1515                 // If a BOM was found, don't include it in the string.
1516                 const uchar16_t* c2 = other;
1517                 if (c_end != unicode::EUTFEE_NATIVE)
1518                 {
1519                         c2 = other + unicode::BOM_UTF16_LEN;
1520                         length -= unicode::BOM_UTF16_LEN;
1521                 }
1522
1523                 // Calculate the size of the string to read in.
1524                 len = 0;
1525                 p = c2;
1526                 do
1527                 {
1528                         ++len;
1529                 } while(*p++ && len < length);
1530                 if (len > length)
1531                         len = length;
1532
1533                 // If we need to grow the size of the array, do it now.
1534                 if (used + len >= allocated)
1535                         reallocate(used + (len * 2));
1536                 u32 start = used;
1537                 used += len;
1538
1539                 // Copy the string now.
1540                 unicode::EUTF_ENDIAN m_end = getEndianness();
1541                 for (u32 l = start; l < start + len; ++l)
1542                 {
1543                         array[l] = (uchar16_t)c2[l];
1544                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1545                                 array[l] = unicode::swapEndian16(array[l]);
1546                 }
1547
1548                 array[used] = 0;
1549
1550                 // Validate our new UTF-16 string.
1551                 validate();
1552                 return *this;
1553         }
1554
1555
1556         //! Appends a UTF-32 string to this ustring16
1557         //! \param other The UTF-32 string to append.
1558         //! \param length The length of the string to append.
1559         //! \return A reference to our current string.
1560         ustring16& append(const uchar32_t* const other, u32 length=0xffffffff)
1561         {
1562                 if (!other)
1563                         return *this;
1564
1565                 // Check for the BOM to determine the string's endianness.
1566                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1567                 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1568                         c_end = unicode::EUTFEE_LITTLE;
1569                 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1570                         c_end = unicode::EUTFEE_BIG;
1571
1572                 // If a BOM was found, don't include it in the string.
1573                 const uchar32_t* c2 = other;
1574                 if (c_end != unicode::EUTFEE_NATIVE)
1575                 {
1576                         c2 = other + unicode::BOM_UTF32_LEN;
1577                         length -= unicode::BOM_UTF32_LEN;
1578                 }
1579
1580                 // Calculate the size of the string to read in.
1581                 u32 len = 0;
1582                 const uchar32_t* p = c2;
1583                 do
1584                 {
1585                         ++len;
1586                 } while(*p++ && len < length);
1587                 if (len > length)
1588                         len = length;
1589
1590                 // If we need to grow the size of the array, do it now.
1591                 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1592                 if (used + (len * 2) >= allocated)
1593                         reallocate(used + ((len * 2) * 2));
1594                 u32 start = used;
1595
1596                 // Convert UTF-32 to UTF-16.
1597                 unicode::EUTF_ENDIAN m_end = getEndianness();
1598                 u32 pos = start;
1599                 for (u32 l = 0; l<len; ++l)
1600                 {
1601                         ++used;
1602
1603                         uchar32_t ch = c2[l];
1604                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1605                                 ch = unicode::swapEndian32(ch);
1606
1607                         if (ch > 0xFFFF)
1608                         {
1609                                 // Split ch up into a surrogate pair as it is over 16 bits long.
1610                                 uchar16_t x = static_cast<uchar16_t>(ch);
1611                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1612                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1613                                 array[pos++] = vh;
1614                                 array[pos++] = vl;
1615                                 ++used;         // Using two shorts, so increased used again.
1616                         }
1617                         else if (ch >= 0xD800 && ch <= 0xDFFF)
1618                         {
1619                                 // Between possible UTF-16 surrogates (invalid!)
1620                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1621                         }
1622                         else array[pos++] = static_cast<uchar16_t>(ch);
1623                 }
1624                 array[used] = 0;
1625
1626                 // Validate our new UTF-16 string.
1627                 validate();
1628
1629                 return *this;
1630         }
1631
1632
1633         //! Appends a ustring16 to this ustring16
1634         //! \param other The string to append to this one.
1635         //! \return A reference to our current string.
1636         ustring16& append(const ustring16& other)
1637         {
1638                 const uchar16_t* oa = other.c_str();
1639
1640                 u32 len = other.size_raw();
1641
1642                 if (used + len >= allocated)
1643                         reallocate(used + len);
1644
1645                 for (u32 l=0; l<len; ++l)
1646                         array[used+l] = oa[l];
1647
1648                 used += len;
1649                 array[used] = 0;
1650
1651                 return *this;
1652         }
1653
1654
1655         //! Appends a certain amount of characters of a ustring16 to this ustring16.
1656         //! \param other The string to append to this one.
1657         //! \param length How many characters of the other string to add to this one.
1658         //! \return A reference to our current string.
1659         ustring16& append(const ustring16& other, u32 length)
1660         {
1661                 if (other.size() == 0)
1662                         return *this;
1663
1664                 if (other.size() < length)
1665                 {
1666                         append(other);
1667                         return *this;
1668                 }
1669
1670                 if (used + length * 2 >= allocated)
1671                         reallocate(used + length * 2);
1672
1673                 const_iterator iter(other, 0);
1674                 u32 l = length;
1675                 while (!iter.atEnd() && l)
1676                 {
1677                         uchar32_t c = *iter;
1678                         append(c);
1679                         ++iter;
1680                         --l;
1681                 }
1682
1683                 return *this;
1684         }
1685
1686
1687         //! Reserves some memory.
1688         //! \param count The amount of characters to reserve.
1689         void reserve(u32 count)
1690         {
1691                 if (count < allocated)
1692                         return;
1693
1694                 reallocate(count);
1695         }
1696
1697
1698         //! Finds first occurrence of character.
1699         //! \param c The character to search for.
1700         //! \return Position where the character has been found, or -1 if not found.
1701         s32 findFirst(uchar32_t c) const
1702         {
1703                 const_iterator i(*this, 0);
1704
1705                 s32 pos = 0;
1706                 while (!i.atEnd())
1707                 {
1708                         uchar32_t t = *i;
1709                         if (c == t)
1710                                 return pos;
1711                         ++pos;
1712                         ++i;
1713                 }
1714
1715                 return -1;
1716         }
1717
1718         //! Finds first occurrence of a character of a list.
1719         //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1720         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1721         //! \return Position where one of the characters has been found, or -1 if not found.
1722         s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1723         {
1724                 if (!c || !count)
1725                         return -1;
1726
1727                 const_iterator i(*this, 0);
1728
1729                 s32 pos = 0;
1730                 while (!i.atEnd())
1731                 {
1732                         uchar32_t t = *i;
1733                         for (u32 j=0; j<count; ++j)
1734                                 if (t == c[j])
1735                                         return pos;
1736                         ++pos;
1737                         ++i;
1738                 }
1739
1740                 return -1;
1741         }
1742
1743
1744         //! Finds first position of a character not in a given list.
1745         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1746         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1747         //! \return Position where the character has been found, or -1 if not found.
1748         s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1749         {
1750                 if (!c || !count)
1751                         return -1;
1752
1753                 const_iterator i(*this, 0);
1754
1755                 s32 pos = 0;
1756                 while (!i.atEnd())
1757                 {
1758                         uchar32_t t = *i;
1759                         u32 j;
1760                         for (j=0; j<count; ++j)
1761                                 if (t == c[j])
1762                                         break;
1763
1764                         if (j==count)
1765                                 return pos;
1766                         ++pos;
1767                         ++i;
1768                 }
1769
1770                 return -1;
1771         }
1772
1773         //! Finds last position of a character not in a given list.
1774         //! \param c A list of characters to NOT find. For example if the method should find the last occurrence of a character not 'a' or 'b', this parameter should be "ab".
1775         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1776         //! \return Position where the character has been found, or -1 if not found.
1777         s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1778         {
1779                 if (!c || !count)
1780                         return -1;
1781
1782                 const_iterator i(end());
1783                 --i;
1784
1785                 s32 pos = size() - 1;
1786                 while (!i.atStart())
1787                 {
1788                         uchar32_t t = *i;
1789                         u32 j;
1790                         for (j=0; j<count; ++j)
1791                                 if (t == c[j])
1792                                         break;
1793
1794                         if (j==count)
1795                                 return pos;
1796                         --pos;
1797                         --i;
1798                 }
1799
1800                 return -1;
1801         }
1802
1803         //! Finds next occurrence of character.
1804         //! \param c The character to search for.
1805         //! \param startPos The position in the string to start searching.
1806         //! \return Position where the character has been found, or -1 if not found.
1807         s32 findNext(uchar32_t c, u32 startPos) const
1808         {
1809                 const_iterator i(*this, startPos);
1810
1811                 s32 pos = startPos;
1812                 while (!i.atEnd())
1813                 {
1814                         uchar32_t t = *i;
1815                         if (t == c)
1816                                 return pos;
1817                         ++pos;
1818                         ++i;
1819                 }
1820
1821                 return -1;
1822         }
1823
1824
1825         //! Finds last occurrence of character.
1826         //! \param c The character to search for.
1827         //! \param start The start position of the reverse search ( default = -1, on end ).
1828         //! \return Position where the character has been found, or -1 if not found.
1829         s32 findLast(uchar32_t c, s32 start = -1) const
1830         {
1831                 u32 s = size();
1832                 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1833
1834                 const_iterator i(*this, start);
1835                 u32 pos = start;
1836                 while (!i.atStart())
1837                 {
1838                         uchar32_t t = *i;
1839                         if (t == c)
1840                                 return pos;
1841                         --pos;
1842                         --i;
1843                 }
1844
1845                 return -1;
1846         }
1847
1848         //! Finds last occurrence of a character in a list.
1849         //! \param c A list of characters to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1850         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1851         //! \return Position where one of the characters has been found, or -1 if not found.
1852         s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1853         {
1854                 if (!c || !count)
1855                         return -1;
1856
1857                 const_iterator i(end());
1858                 --i;
1859
1860                 s32 pos = size();
1861                 while (!i.atStart())
1862                 {
1863                         uchar32_t t = *i;
1864                         for (u32 j=0; j<count; ++j)
1865                                 if (t == c[j])
1866                                         return pos;
1867                         --pos;
1868                         --i;
1869                 }
1870
1871                 return -1;
1872         }
1873
1874
1875         //! Finds another ustring16 in this ustring16.
1876         //! \param str The string to find.
1877         //! \param start The start position of the search.
1878         //! \return Position where the ustring16 has been found, or -1 if not found.
1879         s32 find(const ustring16& str, const u32 start = 0) const
1880         {
1881                 u32 my_size = size();
1882                 u32 their_size = str.size();
1883
1884                 if (their_size == 0 || my_size - start < their_size)
1885                         return -1;
1886
1887                 const_iterator i(*this, start);
1888
1889                 s32 pos = start;
1890                 while (!i.atEnd())
1891                 {
1892                         const_iterator i2(i);
1893                         const_iterator j(str, 0);
1894                         uchar32_t t1 = (uchar32_t)*i2;
1895                         uchar32_t t2 = (uchar32_t)*j;
1896                         while (t1 == t2)
1897                         {
1898                                 ++i2;
1899                                 ++j;
1900                                 if (j.atEnd())
1901                                         return pos;
1902                                 t1 = (uchar32_t)*i2;
1903                                 t2 = (uchar32_t)*j;
1904                         }
1905                         ++i;
1906                         ++pos;
1907                 }
1908
1909                 return -1;
1910         }
1911
1912
1913         //! Finds another ustring16 in this ustring16, comparing raw UTF-16 units.
1914         //! \param str The string to find.
1915         //! \param start The start position of the search, as an index into the raw UTF-16 data.
1916         //! \return Raw position where the string has been found, or -1 if not found.
1917         s32 find_raw(const ustring16& str, const u32 start = 0) const
1918         {
1919                 const uchar16_t* data = str.c_str();
1920                 if (data && *data)
1921                 {
1922                         u32 len = 0;
1923
1924                         while (data[len])
1925                                 ++len;
1926
1927                         if (len > used)
1928                                 return -1;
1929
1930                         for (u32 i=start; i<=used-len; ++i)
1931                         {
1932                                 u32 j=0;
1933
1934                                 while(data[j] && array[i+j] == data[j])
1935                                         ++j;
1936
1937                                 if (!data[j])
1938                                         return i;
1939                         }
1940                 }
1941
1942                 return -1;
1943         }
1944
1945
1946         //! Returns a substring.
1947         //! \param begin: Start of substring.
1948         //! \param length: Length of substring.
1949         //! \return The requested substring as a new ustring16 (empty if begin is out of range or length is not positive).
1950         ustring16 subString(u32 begin, s32 length) const
1951         {
1952                 u32 len = size();
1953                 // if start after ustring16
1954                 // or no proper substring length
1955                 if ((length <= 0) || (begin>=len))
1956                         return ustring16("");
1957                 // clamp length to maximal value
1958                 if ((length+begin) > len)
1959                         length = len-begin;
1960
1961                 ustring16 o;
1962                 o.reserve((length+1) * 2);
1963
1964                 const_iterator i(*this, begin);
1965                 while (!i.atEnd() && length)
1966                 {
1967                         o.append(*i);
1968                         ++i;
1969                         --length;
1970                 }
1971
1972                 return o;
1973         }
1974
1975
1976         //! Appends a character to this ustring16.
1977         //! \param c Character to append.
1978         //! \return A reference to our current string.
1979         ustring16& operator += (char c)
1980         {
1981                 append((uchar32_t)c);
1982                 return *this;
1983         }
1984
1985
1986         //! Appends a character to this ustring16.
1987         //! \param c Character to append.
1988         //! \return A reference to our current string.
1989         ustring16& operator += (uchar32_t c)
1990         {
1991                 append(c);
1992                 return *this;
1993         }
1994
1995
1996         //! Appends a number to this ustring16.
1997         //! \param c Number to append.
1998         //! \return A reference to our current string.
1999         ustring16& operator += (short c)
2000         {
2001                 append(core::stringc(c));
2002                 return *this;
2003         }
2004
2005
2006         //! Appends a number to this ustring16.
2007         //! \param c Number to append.
2008         //! \return A reference to our current string.
2009         ustring16& operator += (unsigned short c)
2010         {
2011                 append(core::stringc(c));
2012                 return *this;
2013         }
2014
2015
2016         //! Appends a number to this ustring16.
2017         //! \param c Number to append.
2018         //! \return A reference to our current string.
2019         ustring16& operator += (int c)
2020         {
2021                 append(core::stringc(c));
2022                 return *this;
2023         }
2024
2025
2026         //! Appends a number to this ustring16.
2027         //! \param c Number to append.
2028         //! \return A reference to our current string.
2029         ustring16& operator += (unsigned int c)
2030         {
2031                 append(core::stringc(c));
2032                 return *this;
2033         }
2034
2035
2036         //! Appends a number to this ustring16.
2037         //! \param c Number to append.
2038         //! \return A reference to our current string.
2039         ustring16& operator += (long c)
2040         {
2041                 append(core::stringc(c));
2042                 return *this;
2043         }
2044
2045
2046         //! Appends a number to this ustring16.
2047         //! \param c Number to append.
2048         //! \return A reference to our current string.
2049         ustring16& operator += (unsigned long c)
2050         {
2051                 append(core::stringc(c));
2052                 return *this;
2053         }
2054
2055
2056         //! Appends a number to this ustring16.
2057         //! \param c Number to append.
2058         //! \return A reference to our current string.
2059         ustring16& operator += (double c)
2060         {
2061                 append(core::stringc(c));
2062                 return *this;
2063         }
2064
2065
2066         //! Appends a null-terminated UTF-16 string to this ustring16.
2067         //! \param c UTF-16 string to append.
2068         //! \return A reference to our current string.
2069         ustring16& operator += (const uchar16_t* const c)
2070         {
2071                 append(c);
2072                 return *this;
2073         }
2074
2075
2076         //! Appends a ustring16 to this ustring16.
2077         //! \param other ustring16 to append.
2078         //! \return A reference to our current string.
2079         ustring16& operator += (const ustring16& other)
2080         {
2081                 append(other);
2082                 return *this;
2083         }
2084
2085
2086         //! Replaces all characters of a given type with another one.
2087         //! \param toReplace Character to replace.
2088         //! \param replaceWith Character replacing the old one.
2089         //! \return A reference to our current string.
2090         ustring16& replace(uchar32_t toReplace, uchar32_t replaceWith)
2091         {
2092                 iterator i(*this, 0);
2093                 while (!i.atEnd())
2094                 {
2095                         typename ustring16::access a = *i;
2096                         if ((uchar32_t)a == toReplace)
2097                                 a = replaceWith;
2098                         ++i;
2099                 }
2100                 return *this;
2101         }
2102
2103
2104         //! Replaces all instances of a string with another one.
2105         //! \param toReplace The string to replace.
2106         //! \param replaceWith The string replacing the old one.
2107         //! \return A reference to our current string.
2108         ustring16& replace(const ustring16& toReplace, const ustring16& replaceWith)
2109         {
2110                 if (toReplace.size() == 0)
2111                         return *this;
2112
2113                 const uchar16_t* other = toReplace.c_str();
2114                 const uchar16_t* replace = replaceWith.c_str();
2115                 const u32 other_size = toReplace.size_raw();
2116                 const u32 replace_size = replaceWith.size_raw();
2117
2118                 // Determine the delta.  The algorithm will change depending on the delta.
2119                 s32 delta = replace_size - other_size;
2120
2121                 // A character for character replace.  The string will not shrink or grow.
2122                 if (delta == 0)
2123                 {
2124                         s32 pos = 0;
2125                         while ((pos = find_raw(other, pos)) != -1)
2126                         {
2127                                 for (u32 i = 0; i < replace_size; ++i)
2128                                         array[pos + i] = replace[i];
2129                                 ++pos;
2130                         }
2131                         return *this;
2132                 }
2133
2134                 // We are going to be removing some characters.  The string will shrink.
2135                 if (delta < 0)
2136                 {
2137                         u32 i = 0;
2138                         for (u32 pos = 0; pos <= used; ++i, ++pos)
2139                         {
2140                                 // Is this potentially a match?
2141                                 if (array[pos] == *other)
2142                                 {
2143                                         // Check to see if we have a match.
2144                                         u32 j;
2145                                         for (j = 0; j < other_size; ++j)
2146                                         {
2147                                                 if (array[pos + j] != other[j])
2148                                                         break;
2149                                         }
2150
2151                                         // If we have a match, replace characters.
2152                                         if (j == other_size)
2153                                         {
2154                                                 for (j = 0; j < replace_size; ++j)
2155                                                         array[i + j] = replace[j];
2156                                                 i += replace_size - 1;
2157                                                 pos += other_size - 1;
2158                                                 continue;
2159                                         }
2160                                 }
2161
2162                                 // No match found, just copy characters.
2163                                 array[i - 1] = array[pos];
2164                         }
2165                         array[i] = 0;
2166                         used = i;
2167
2168                         return *this;
2169                 }
2170
2171                 // We are going to be adding characters, so the string size will increase.
2172                 // Count the number of times toReplace exists in the string so we can allocate the new size.
2173                 u32 find_count = 0;
2174                 s32 pos = 0;
2175                 while ((pos = find_raw(other, pos)) != -1)
2176                 {
2177                         ++find_count;
2178                         ++pos;
2179                 }
2180
2181                 // Re-allocate the string now, if needed.
2182                 u32 len = delta * find_count;
2183                 if (used + len >= allocated)
2184                         reallocate(used + len);
2185
2186                 // Start replacing.
2187                 pos = 0;
2188                 while ((pos = find_raw(other, pos)) != -1)
2189                 {
2190                         uchar16_t* start = array + pos + other_size - 1;
2191                         uchar16_t* ptr   = array + used;
2192                         uchar16_t* end   = array + used + delta;
2193
2194                         // Shift characters to make room for the string.
2195                         while (ptr != start)
2196                         {
2197                                 *end = *ptr;
2198                                 --ptr;
2199                                 --end;
2200                         }
2201
2202                         // Add the new string now.
2203                         for (u32 i = 0; i < replace_size; ++i)
2204                                 array[pos + i] = replace[i];
2205
2206                         pos += replace_size;
2207                         used += delta;
2208                 }
2209
2210                 // Terminate the string and return ourself.
2211                 array[used] = 0;
2212                 return *this;
2213         }
2214
2215
2216         //! Removes all occurrences of a character from a ustring16.
2217         //! \param c The character to remove.
2218         //! \return A reference to our current string.
2219         ustring16& remove(uchar32_t c)
2220         {
2221                 u32 pos = 0;
2222                 u32 found = 0;
2223                 u32 len = (c > 0xFFFF ? 2 : 1);         // Remove characters equal to the size of c as a UTF-16 character.
2224                 for (u32 i=0; i<=used; ++i)
2225                 {
2226                         uchar32_t uc32 = 0;
2227                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2228                                 uc32 |= array[i];
2229                         else if (i + 1 <= used)
2230                         {
2231                                 // Convert the surrogate pair into a single UTF-32 character.
2232                                 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2233                         }
2234                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2235
2236                         if (uc32 == c)
2237                         {
2238                                 found += len;
2239                                 continue;
2240                         }
2241
2242                         array[pos++] = array[i];
2243                         if (len2 == 2)
2244                                 array[pos++] = array[++i];
2245                 }
2246                 used -= found;
2247                 array[used] = 0;
2248                 return *this;
2249         }
2250
2251
2252         //! Removes a ustring16 from the ustring16.
2253         //! \param toRemove The string to remove.
2254         //! \return A reference to our current string.
2255         ustring16& remove(const ustring16& toRemove)
2256         {
2257                 u32 size = toRemove.size_raw();
2258                 if (size == 0) return *this;
2259
2260                 const uchar16_t* tra = toRemove.c_str();
2261                 u32 pos = 0;
2262                 u32 found = 0;
2263                 for (u32 i=0; i<=used; ++i)
2264                 {
2265                         u32 j = 0;
2266                         while (j < size)
2267                         {
2268                                 if (array[i + j] != tra[j])
2269                                         break;
2270                                 ++j;
2271                         }
2272                         if (j == size)
2273                         {
2274                                 found += size;
2275                                 i += size - 1;
2276                                 continue;
2277                         }
2278
2279                         array[pos++] = array[i];
2280                 }
2281                 used -= found;
2282                 array[used] = 0;
2283                 return *this;
2284         }
2285
2286
2287         //! Removes characters from the ustring16.
2288         //! \param characters The characters to remove.
2289         //! \return A reference to our current string.
2290         ustring16& removeChars(const ustring16& characters)
2291         {
2292                 if (characters.size_raw() == 0)
2293                         return *this;
2294
2295                 u32 pos = 0;
2296                 u32 found = 0;
2297                 const_iterator iter(characters);
2298                 for (u32 i=0; i<=used; ++i)
2299                 {
2300                         uchar32_t uc32 = 0;
2301                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2302                                 uc32 |= array[i];
2303                         else if (i + 1 <= used)
2304                         {
2305                                 // Convert the surrogate pair into a single UTF-32 character.
2306                                 uc32 = unicode::toUTF32(array[i], array[i+1]);
2307                         }
2308                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2309
2310                         bool cont = false;
2311                         iter.toStart();
2312                         while (!iter.atEnd())
2313                         {
2314                                 uchar32_t c = *iter;
2315                                 if (uc32 == c)
2316                                 {
2317                                         found += (c > 0xFFFF ? 2 : 1);          // Remove characters equal to the size of c as a UTF-16 character.
2318                                         ++i;
2319                                         cont = true;
2320                                         break;
2321                                 }
2322                                 ++iter;
2323                         }
2324                         if (cont) continue;
2325
2326                         array[pos++] = array[i];
2327                         if (len2 == 2)
2328                                 array[pos++] = array[++i];
2329                 }
2330                 used -= found;
2331                 array[used] = 0;
2332                 return *this;
2333         }
2334
2335
2336         //! Trims the ustring16.
2337         //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16.
2338         //! \param whitespace The characters that are to be considered as whitespace.
2339         //! \return A reference to our current string.
2340         ustring16& trim(const ustring16& whitespace = " \t\n\r")
2341         {
2342                 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2343
2344                 // find start and end of the substring without the specified characters
2345                 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2346                 if (begin == -1)
2347                         return (*this="");
2348
2349                 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2350
2351                 return (*this = subString(begin, (end +1) - begin));
2352         }
2353
2354
2355         //! Erases a character from the ustring16.
2356         //! May be slow, because all elements following after the erased element have to be copied.
2357         //! \param index Index of element to be erased.
2358         //! \return A reference to our current string.
2359         ustring16& erase(u32 index)
2360         {
2361                 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2362
2363                 iterator i(*this, index);
2364
2365                 uchar32_t t = *i;
2366                 u32 len = (t > 0xFFFF ? 2 : 1);
2367
2368                 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2369                         array[j - len] = array[j];
2370
2371                 used -= len;
2372                 array[used] = 0;
2373
2374                 return *this;
2375         }
2376
2377
2378         //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2379         //! \return A reference to our current string.
2380         ustring16& validate()
2381         {
2382                 // Validate all unicode characters.
2383                 for (u32 i=0; i<allocated; ++i)
2384                 {
2385                         // Terminate on existing null.
2386                         if (array[i] == 0)
2387                         {
2388                                 used = i;
2389                                 return *this;
2390                         }
2391                         if (UTF16_IS_SURROGATE(array[i]))
2392                         {
2393                                 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2394                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2395                                 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2396                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2397                                 ++i;
2398                         }
2399                         if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2400                                 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2401                 }
2402
2403                 // terminate
2404                 used = 0;
2405                 if (allocated > 0)
2406                 {
2407                         used = allocated - 1;
2408                         array[used] = 0;
2409                 }
2410                 return *this;
2411         }
2412
2413
2414         //! Gets the last char of the ustring16, or 0.
2415         //! \return The last char of the ustring16, or 0.
2416         uchar32_t lastChar() const
2417         {
2418                 if (used < 1)
2419                         return 0;
2420
2421                 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2422                 {
2423                         // Make sure we have a paired surrogate.
2424                         if (used < 2)
2425                                 return 0;
2426
2427                         // Check for an invalid surrogate.
2428                         if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2429                                 return 0;
2430
2431                         // Convert the surrogate pair into a single UTF-32 character.
2432                         return unicode::toUTF32(array[used-2], array[used-1]);
2433                 }
2434                 else
2435                 {
2436                         return array[used-1];
2437                 }
2438         }
2439
2440
2441         //! Split the ustring16 into parts.
2442         /** This method will split a ustring16 at certain delimiter characters
2443         into the container passed in as reference. The type of the container
2444         has to be given as template parameter. It must provide a push_back and
2445         a size method.
2446         \param ret The result container
2447         \param c C-style ustring16 of delimiter characters
2448         \param count Number of delimiter characters
2449         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2450         container. If two delimiters occur without a character in between, an
2451         empty substring would be placed in the result. If this flag is set,
2452         only non-empty strings are stored.
2453         \param keepSeparators Flag which allows to add the separator to the
2454         result ustring16. If this flag is true, the concatenation of the
2455         substrings results in the original ustring16. Otherwise, only the
2456         characters between the delimiters are returned.
2457         \return The number of resulting substrings
2458         */
2459         template<class container>
2460         u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2461         {
2462                 if (!c)
2463                         return 0;
2464
2465                 const_iterator i(*this);
2466                 const u32 oldSize=ret.size();
2467                 u32 pos = 0;
2468                 u32 lastpos = 0;
2469                 u32 lastpospos = 0;
2470                 bool lastWasSeparator = false;
2471                 while (!i.atEnd())
2472                 {
2473                         uchar32_t ch = *i;
2474                         bool foundSeparator = false;
2475                         for (u32 j=0; j<count; ++j)
2476                         {
2477                                 if (ch == c[j])
2478                                 {
2479                                         if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2480                                                         !lastWasSeparator)
2481                                         ret.push_back(ustring16(&array[lastpospos], pos - lastpos));
2482                                         foundSeparator = true;
2483                                         lastpos = (keepSeparators ? pos : pos + 1);
2484                                         lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2485                                         break;
2486                                 }
2487                         }
2488                         lastWasSeparator = foundSeparator;
2489                         ++pos;
2490                         ++i;
2491                 }
2492                 u32 s = size() + 1;
2493                 if (s > lastpos)
2494                         ret.push_back(ustring16(&array[lastpospos], s - lastpos));
2495                 return ret.size()-oldSize;
2496         }
2497
2498
2499         //! Split the ustring16 into parts.
2500         /** This method will split a ustring16 at certain delimiter characters
2501         into the container passed in as reference. The type of the container
2502         has to be given as template parameter. It must provide a push_back and
2503         a size method.
2504         \param ret The result container
2505         \param c A unicode string of delimiter characters
2506         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2507         container. If two delimiters occur without a character in between, an
2508         empty substring would be placed in the result. If this flag is set,
2509         only non-empty strings are stored.
2510         \param keepSeparators Flag which allows to add the separator to the
2511         result ustring16. If this flag is true, the concatenation of the
2512         substrings results in the original ustring16. Otherwise, only the
2513         characters between the delimiters are returned.
2514         \return The number of resulting substrings
2515         */
2516         template<class container>
2517         u32 split(container& ret, const ustring16& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2518         {
2519                 core::array<uchar32_t> v = c.toUTF32();
2520                 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2521         }
2522
2523
2524         //! Gets the size of the allocated memory buffer for the string.
2525         //! \return The size of the allocated memory buffer.
2526         u32 capacity() const
2527         {
2528                 return allocated;
2529         }
2530
2531
2532         //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates.
2533         //! \return The raw number of UTF-16 code points, excluding the trialing NUL.
2534         u32 size_raw() const
2535         {
2536                 return used;
2537         }
2538
2539
2540         //! Inserts a character into the string.
2541         //! \param c The character to insert.
2542         //! \param pos The position to insert the character.
2543         //! \return A reference to our current string.
2544         ustring16& insert(uchar32_t c, u32 pos)
2545         {
2546                 u8 len = (c > 0xFFFF ? 2 : 1);
2547
2548                 if (used + len >= allocated)
2549                         reallocate(used + len);
2550
2551                 used += len;
2552
2553                 iterator iter(*this, pos);
2554                 for (u32 i = used - 2; i > iter.getPos(); --i)
2555                         array[i] = array[i - len];
2556
2557                 if (c > 0xFFFF)
2558                 {
2559                         // c will be multibyte, so split it up into a surrogate pair.
2560                         uchar16_t x = static_cast<uchar16_t>(c);
2561                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2562                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2563                         array[iter.getPos()] = vh;
2564                         array[iter.getPos()+1] = vl;
2565                 }
2566                 else
2567                 {
2568                         array[iter.getPos()] = static_cast<uchar16_t>(c);
2569                 }
2570                 array[used] = 0;
2571                 return *this;
2572         }
2573
2574
2575         //! Inserts a string into the string.
2576         //! \param c The string to insert.
2577         //! \param pos The position to insert the string.
2578         //! \return A reference to our current string.
2579         ustring16& insert(const ustring16& c, u32 pos)
2580         {
2581                 u32 len = c.size_raw();
2582                 if (len == 0) return *this;
2583
2584                 if (used + len >= allocated)
2585                         reallocate(used + len);
2586
2587                 used += len;
2588
2589                 iterator iter(*this, pos);
2590                 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2591                         array[i] = array[i - len];
2592
2593                 const uchar16_t* s = c.c_str();
2594                 for (u32 i = 0; i < len; ++i)
2595                 {
2596                         array[pos++] = *s;
2597                         ++s;
2598                 }
2599
2600                 array[used] = 0;
2601                 return *this;
2602         }
2603
2604
2605         //! Inserts a character into the string.
2606         //! \param c The character to insert.
2607         //! \param pos The position to insert the character.
2608         //! \return A reference to our current string.
2609         ustring16& insert_raw(uchar16_t c, u32 pos)
2610         {
2611                 if (used + 1 >= allocated)
2612                         reallocate(used + 1);
2613
2614                 ++used;
2615
2616                 for (u32 i = used - 1; i > pos; --i)
2617                         array[i] = array[i - 1];
2618
2619                 array[pos] = c;
2620                 array[used] = 0;
2621                 return *this;
2622         }
2623
2624
2625         //! Removes a character from string.
2626         //! \param pos Position of the character to remove.
2627         //! \return A reference to our current string.
2628         ustring16& erase_raw(u32 pos)
2629         {
2630                 for (u32 i=pos; i<=used; ++i)
2631                 {
2632                         array[i] = array[i + 1];
2633                 }
2634                 --used;
2635                 array[used] = 0;
2636                 return *this;
2637         }
2638
2639
2640         //! Replaces a character in the string.
2641         //! \param c The new character.
2642         //! \param pos The position of the character to replace.
2643         //! \return A reference to our current string.
2644         ustring16& replace_raw(uchar16_t c, u32 pos)
2645         {
2646                 array[pos] = c;
2647                 return *this;
2648         }
2649
2650
2651         //! Returns an iterator to the beginning of the string.
2652         //! \return An iterator to the beginning of the string.
2653         iterator begin()
2654         {
2655                 iterator i(*this, 0);
2656                 return i;
2657         }
2658
2659
2660         //! Returns an iterator to the beginning of the string.
2661         //! \return An iterator to the beginning of the string.
2662         const_iterator begin() const
2663         {
2664                 const_iterator i(*this, 0);
2665                 return i;
2666         }
2667
2668
2669         //! Returns an iterator to the beginning of the string.
2670         //! \return An iterator to the beginning of the string.
2671         const_iterator cbegin() const
2672         {
2673                 const_iterator i(*this, 0);
2674                 return i;
2675         }
2676
2677
2678         //! Returns an iterator to the end of the string.
2679         //! \return An iterator to the end of the string.
2680         iterator end()
2681         {
2682                 iterator i(*this, 0);
2683                 i.toEnd();
2684                 return i;
2685         }
2686
2687
2688         //! Returns an iterator to the end of the string.
2689         //! \return An iterator to the end of the string.
2690         const_iterator end() const
2691         {
2692                 const_iterator i(*this, 0);
2693                 i.toEnd();
2694                 return i;
2695         }
2696
2697
2698         //! Returns an iterator to the end of the string.
2699         //! \return An iterator to the end of the string.
2700         const_iterator cend() const
2701         {
2702                 const_iterator i(*this, 0);
2703                 i.toEnd();
2704                 return i;
2705         }
2706
2707
2708         //! Converts the string to a UTF-8 encoded string.
2709         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2710         //! \return A string containing the UTF-8 encoded string.
2711         core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2712         {
2713                 core::string<uchar8_t> ret;
2714                 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2715                 const_iterator iter(*this, 0);
2716
2717                 // Add the byte order mark if the user wants it.
2718                 if (addBOM)
2719                 {
2720                         ret.append(unicode::BOM_ENCODE_UTF8[0]);
2721                         ret.append(unicode::BOM_ENCODE_UTF8[1]);
2722                         ret.append(unicode::BOM_ENCODE_UTF8[2]);
2723                 }
2724
2725                 while (!iter.atEnd())
2726                 {
2727                         uchar32_t c = *iter;
2728                         if (c > 0xFFFF)
2729                         {       // 4 bytes
2730                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2731                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2732                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2733                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2734                                 ret.append(b1);
2735                                 ret.append(b2);
2736                                 ret.append(b3);
2737                                 ret.append(b4);
2738                         }
2739                         else if (c > 0x7FF)
2740                         {       // 3 bytes
2741                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2742                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2743                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2744                                 ret.append(b1);
2745                                 ret.append(b2);
2746                                 ret.append(b3);
2747                         }
2748                         else if (c > 0x7F)
2749                         {       // 2 bytes
2750                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2751                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2752                                 ret.append(b1);
2753                                 ret.append(b2);
2754                         }
2755                         else
2756                         {       // 1 byte
2757                                 ret.append(static_cast<uchar8_t>(c));
2758                         }
2759                         ++iter;
2760                 }
2761                 return ret;
2762         }
2763
2764
2765         //! Converts the string to a UTF-8 encoded string array.
2766         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2767         //! \return An array containing the UTF-8 encoded string.
2768         core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2769         {
2770                 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2771                 const_iterator iter(*this, 0);
2772
2773                 // Add the byte order mark if the user wants it.
2774                 if (addBOM)
2775                 {
2776                         ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2777                         ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2778                         ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2779                 }
2780
2781                 while (!iter.atEnd())
2782                 {
2783                         uchar32_t c = *iter;
2784                         if (c > 0xFFFF)
2785                         {       // 4 bytes
2786                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2787                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2788                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2789                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2790                                 ret.push_back(b1);
2791                                 ret.push_back(b2);
2792                                 ret.push_back(b3);
2793                                 ret.push_back(b4);
2794                         }
2795                         else if (c > 0x7FF)
2796                         {       // 3 bytes
2797                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2798                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2799                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2800                                 ret.push_back(b1);
2801                                 ret.push_back(b2);
2802                                 ret.push_back(b3);
2803                         }
2804                         else if (c > 0x7F)
2805                         {       // 2 bytes
2806                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2807                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2808                                 ret.push_back(b1);
2809                                 ret.push_back(b2);
2810                         }
2811                         else
2812                         {       // 1 byte
2813                                 ret.push_back(static_cast<uchar8_t>(c));
2814                         }
2815                         ++iter;
2816                 }
2817                 ret.push_back(0);
2818                 return ret;
2819         }
2820
2821
2822         //! Converts the string to a UTF-16 encoded string array.
2823         //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2824         //! \param endian The desired endianness of the string.
2825         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2826         //! \return An array containing the UTF-16 encoded string.
2827         core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2828         {
2829                 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2830                 uchar16_t* ptr = ret.pointer();
2831
2832                 // Add the BOM if specified.
2833                 if (addBOM)
2834                 {
2835                         if (endian == unicode::EUTFEE_NATIVE)
2836                                 *ptr = unicode::BOM;
2837                         else if (endian == unicode::EUTFEE_LITTLE)
2838                         {
2839                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2840                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2841                                 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2842                         }
2843                         else
2844                         {
2845                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2846                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2847                                 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2848                         }
2849                         ++ptr;
2850                 }
2851
2852                 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2853                 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2854                 {
2855                         for (u32 i = 0; i <= used; ++i)
2856                                 ptr[i] = unicode::swapEndian16(ptr[i]);
2857                 }
2858                 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2859                 ret.push_back(0);
2860                 return ret;
2861         }
2862
2863
2864         //! Converts the string to a UTF-32 encoded string array.
2865         //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
2866         //! \param endian The desired endianness of the string.
2867         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2868         //! \return An array containing the UTF-32 encoded string.
2869         core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2870         {
2871                 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
2872                 const_iterator iter(*this, 0);
2873
2874                 // Add the BOM if specified.
2875                 if (addBOM)
2876                 {
2877                         if (endian == unicode::EUTFEE_NATIVE)
2878                                 ret.push_back(unicode::BOM);
2879                         else
2880                         {
2881                                 union
2882                                 {
2883                                         uchar32_t full;
2884                                         u8 chunk[4];
2885                                 } t;
2886
2887                                 if (endian == unicode::EUTFEE_LITTLE)
2888                                 {
2889                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
2890                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
2891                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
2892                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
2893                                 }
2894                                 else
2895                                 {
2896                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
2897                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
2898                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
2899                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
2900                                 }
2901                                 ret.push_back(t.full);
2902                         }
2903                 }
2904                 ret.push_back(0);
2905
2906                 while (!iter.atEnd())
2907                 {
2908                         uchar32_t c = *iter;
2909                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2910                                 c = unicode::swapEndian32(c);
2911                         ret.push_back(c);
2912                         ++iter;
2913                 }
2914                 return ret;
2915         }
2916
2917
2918         //! Converts the string to a wchar_t encoded string.
2919         /** The size of a wchar_t changes depending on the platform.  This function will store a
2920         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
2921         //! \param endian The desired endianness of the string.
2922         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2923         //! \return A string containing the wchar_t encoded string.
2924         core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2925         {
2926                 if (sizeof(wchar_t) == 4)
2927                 {
2928                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
2929                         core::stringw ret(a.pointer());
2930                         return ret;
2931                 }
2932                 else if (sizeof(wchar_t) == 2)
2933                 {
2934                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
2935                         {
2936                                 core::stringw ret(array);
2937                                 return ret;
2938                         }
2939                         else
2940                         {
2941                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
2942                                 core::stringw ret(a.pointer());
2943                                 return ret;
2944                         }
2945                 }
2946                 else if (sizeof(wchar_t) == 1)
2947                 {
2948                         core::array<uchar8_t> a(toUTF8(addBOM));
2949                         core::stringw ret(a.pointer());
2950                         return ret;
2951                 }
2952
2953                 // Shouldn't happen.
2954                 return core::stringw();
2955         }
2956
2957
2958         //! Converts the string to a wchar_t encoded string array.
2959         /** The size of a wchar_t changes depending on the platform.  This function will store a
2960         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
2961         //! \param endian The desired endianness of the string.
2962         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2963         //! \return An array containing the wchar_t encoded string.
2964         core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2965         {
2966                 if (sizeof(wchar_t) == 4)
2967                 {
2968                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
2969                         core::array<wchar_t> ret(a.size());
2970                         ret.set_used(a.size());
2971                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
2972                         return ret;
2973                 }
2974                 if (sizeof(wchar_t) == 2)
2975                 {
2976                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
2977                         {
2978                                 core::array<wchar_t> ret(used);
2979                                 ret.set_used(used);
2980                                 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
2981                                 return ret;
2982                         }
2983                         else
2984                         {
2985                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
2986                                 core::array<wchar_t> ret(a.size());
2987                                 ret.set_used(a.size());
2988                                 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
2989                                 return ret;
2990                         }
2991                 }
2992                 if (sizeof(wchar_t) == 1)
2993                 {
2994                         core::array<uchar8_t> a(toUTF8(addBOM));
2995                         core::array<wchar_t> ret(a.size());
2996                         ret.set_used(a.size());
2997                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
2998                         return ret;
2999                 }
3000
3001                 // Shouldn't happen.
3002                 return core::array<wchar_t>();
3003         }
3004
3005         //! Converts the string to a properly encoded io::path string.
3006         //! \param endian The desired endianness of the string.
3007         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3008         //! \return An io::path string containing the properly encoded string.
3009         io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3010         {
3011                 return toUTF8_s(addBOM);
3012         }
3013
3014         //! Loads an unknown stream of data.
3015         //! Will attempt to determine if the stream is unicode data.  Useful for loading from files.
3016         //! \param data The data stream to load from.
3017         //! \param data_size The length of the data string.
3018         //! \return A reference to our current string.
3019         ustring16& loadDataStream(const char* data, size_t data_size)
3020         {
3021                 // Clear our string.
3022                 *this = "";
3023                 if (!data)
3024                         return *this;
3025
3026                 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
3027                 switch (e)
3028                 {
3029                         default:
3030                         case unicode::EUTFE_UTF8:
3031                                 append((uchar8_t*)data, data_size);
3032                                 break;
3033
3034                         case unicode::EUTFE_UTF16:
3035                         case unicode::EUTFE_UTF16_BE:
3036                         case unicode::EUTFE_UTF16_LE:
3037                                 append((uchar16_t*)data, data_size / 2);
3038                                 break;
3039
3040                         case unicode::EUTFE_UTF32:
3041                         case unicode::EUTFE_UTF32_BE:
3042                         case unicode::EUTFE_UTF32_LE:
3043                                 append((uchar32_t*)data, data_size / 4);
3044                                 break;
3045                 }
3046
3047                 return *this;
3048         }
3049
3050         //! Gets the encoding of the Unicode string this class contains.
3051         //! \return An enum describing the current encoding of this string.
3052         unicode::EUTF_ENCODE getEncoding() const
3053         {
3054                 return encoding;
3055         }
3056
3057         //! Gets the endianness of the Unicode string this class contains.
3058         //! \return An enum describing the endianness of this string.
3059         unicode::EUTF_ENDIAN getEndianness() const
3060         {
3061                 if (encoding == unicode::EUTFE_UTF16_LE ||
3062                         encoding == unicode::EUTFE_UTF32_LE)
3063                         return unicode::EUTFEE_LITTLE;
3064                 else return unicode::EUTFEE_BIG;
3065         }
3066
3067 private:
3068
3069         //! Reallocate the string, making it bigger or smaller.
3070         //! \param new_size The new size of the string.
3071         void reallocate(u32 new_size)
3072         {
3073                 uchar16_t* old_array = array;
3074
3075                 array = new uchar16_t[new_size + 1];
3076                 allocated = new_size + 1;
3077                 if (old_array == 0) return;
3078
3079                 u32 amount = used < new_size ? used : new_size;
3080                 for (u32 i=0; i<=amount; ++i)
3081                         array[i] = old_array[i];
3082
3083                 if (allocated <= used)
3084                         used = allocated - 1;
3085
3086                 array[used] = 0;
3087
3088                 delete [] old_array;
3089         }
3090
        //--- member variables

        //! Buffer of 16-bit code units; reallocate() writes a 0 terminator at array[used].
        uchar16_t* array;
        //! Encoding tag returned by getEncoding() and inspected by getEndianness().
        unicode::EUTF_ENCODE encoding;
        //! Number of code units in the buffer; reallocate() sets it to new_size + 1.
        u32 allocated;
        //! Number of code units in use; the terminator sits at index `used`.
        u32 used;
3097 };
3098
//! Shorter name for ustring16.
typedef ustring16 ustring;
3100
3101
/* These overloads cause ambiguous-overload errors and don't seem to be in use. */
3103 #if 0
3104 //! Appends two ustring16s.
3105 inline ustring16 operator+(const ustring16& left, const ustring16& right)
3106 {
3107         ustring16 ret(left);
3108         ret += right;
3109         return ret;
3110 }
3111
3112
3113 //! Appends a ustring16 and a null-terminated unicode string.
3114 template <class B>
3115 inline ustring16 operator+(const ustring16& left, const B* const right)
3116 {
3117         ustring16 ret(left);
3118         ret += right;
3119         return ret;
3120 }
3121
3122
3123 //! Appends a ustring16 and a null-terminated unicode string.
3124 template <class B>
3125 inline ustring16 operator+(const B* const left, const ustring16& right)
3126 {
3127         ustring16 ret(left);
3128         ret += right;
3129         return ret;
3130 }
3131
3132
3133 //! Appends a ustring16 and an Irrlicht string.
3134 template <typename B>
3135 inline ustring16 operator+(const ustring16& left, const string<B>& right)
3136 {
3137         ustring16 ret(left);
3138         ret += right;
3139         return ret;
3140 }
3141
3142
3143 //! Appends a ustring16 and an Irrlicht string.
3144 template <typename B>
3145 inline ustring16 operator+(const string<B>& left, const ustring16& right)
3146 {
3147         ustring16 ret(left);
3148         ret += right;
3149         return ret;
3150 }
3151
3152
3153 //! Appends a ustring16 and a std::basic_string.
3154 template <typename B, typename A, typename BAlloc>
3155 inline ustring16 operator+(const ustring16& left, const std::basic_string<B, A, BAlloc>& right)
3156 {
3157         ustring16 ret(left);
3158         ret += right;
3159         return ret;
3160 }
3161
3162
3163 //! Appends a ustring16 and a std::basic_string.
3164 template <typename B, typename A, typename BAlloc>
3165 inline ustring16 operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16& right)
3166 {
3167         ustring16 ret(left);
3168         ret += right;
3169         return ret;
3170 }
3171
3172
3173 //! Appends a ustring16 and a char.
3174 inline ustring16 operator+(const ustring16& left, const char right)
3175 {
3176         ustring16 ret(left);
3177         ret += right;
3178         return ret;
3179 }
3180
3181
3182 //! Appends a ustring16 and a char.
3183 inline ustring16 operator+(const char left, const ustring16& right)
3184 {
3185         ustring16 ret(left);
3186         ret += right;
3187         return ret;
3188 }
3189
3190
3191 //! Appends a ustring16 and a uchar32_t.
3192 inline ustring16 operator+(const ustring16& left, const uchar32_t right)
3193 {
3194         ustring16 ret(left);
3195         ret += right;
3196         return ret;
3197 }
3198
3199
3200 //! Appends a ustring16 and a uchar32_t.
3201 inline ustring16 operator+(const uchar32_t left, const ustring16& right)
3202 {
3203         ustring16 ret(left);
3204         ret += right;
3205         return ret;
3206 }
3207
3208
3209 //! Appends a ustring16 and a short.
3210 inline ustring16 operator+(const ustring16& left, const short right)
3211 {
3212         ustring16 ret(left);
3213         ret += core::stringc(right);
3214         return ret;
3215 }
3216
3217
3218 //! Appends a ustring16 and a short.
3219 inline ustring16 operator+(const short left, const ustring16& right)
3220 {
3221         ustring16 ret((core::stringc(left)));
3222         ret += right;
3223         return ret;
3224 }
3225
3226
3227 //! Appends a ustring16 and an unsigned short.
3228 inline ustring16 operator+(const ustring16& left, const unsigned short right)
3229 {
3230         ustring16 ret(left);
3231         ret += core::stringc(right);
3232         return ret;
3233 }
3234
3235
3236 //! Appends a ustring16 and an unsigned short.
3237 inline ustring16 operator+(const unsigned short left, const ustring16& right)
3238 {
3239         ustring16 ret((core::stringc(left)));
3240         ret += right;
3241         return ret;
3242 }
3243
3244
3245 //! Appends a ustring16 and an int.
3246 inline ustring16 operator+(const ustring16& left, const int right)
3247 {
3248         ustring16 ret(left);
3249         ret += core::stringc(right);
3250         return ret;
3251 }
3252
3253
3254 //! Appends a ustring16 and an int.
3255 inline ustring16 operator+(const int left, const ustring16& right)
3256 {
3257         ustring16 ret((core::stringc(left)));
3258         ret += right;
3259         return ret;
3260 }
3261
3262
3263 //! Appends a ustring16 and an unsigned int.
3264 inline ustring16 operator+(const ustring16& left, const unsigned int right)
3265 {
3266         ustring16 ret(left);
3267         ret += core::stringc(right);
3268         return ret;
3269 }
3270
3271
3272 //! Appends a ustring16 and an unsigned int.
3273 inline ustring16 operator+(const unsigned int left, const ustring16& right)
3274 {
3275         ustring16 ret((core::stringc(left)));
3276         ret += right;
3277         return ret;
3278 }
3279
3280
3281 //! Appends a ustring16 and a long.
3282 inline ustring16 operator+(const ustring16& left, const long right)
3283 {
3284         ustring16 ret(left);
3285         ret += core::stringc(right);
3286         return ret;
3287 }
3288
3289
3290 //! Appends a ustring16 and a long.
3291 inline ustring16 operator+(const long left, const ustring16& right)
3292 {
3293         ustring16 ret((core::stringc(left)));
3294         ret += right;
3295         return ret;
3296 }
3297
3298
3299 //! Appends a ustring16 and an unsigned long.
3300 inline ustring16 operator+(const ustring16& left, const unsigned long right)
3301 {
3302         ustring16 ret(left);
3303         ret += core::stringc(right);
3304         return ret;
3305 }
3306
3307
3308 //! Appends a ustring16 and an unsigned long.
3309 inline ustring16 operator+(const unsigned long left, const ustring16& right)
3310 {
3311         ustring16 ret((core::stringc(left)));
3312         ret += right;
3313         return ret;
3314 }
3315
3316
3317 //! Appends a ustring16 and a float.
3318 inline ustring16 operator+(const ustring16& left, const float right)
3319 {
3320         ustring16 ret(left);
3321         ret += core::stringc(right);
3322         return ret;
3323 }
3324
3325
3326 //! Appends a ustring16 and a float.
3327 inline ustring16 operator+(const float left, const ustring16& right)
3328 {
3329         ustring16 ret((core::stringc(left)));
3330         ret += right;
3331         return ret;
3332 }
3333
3334
3335 //! Appends a ustring16 and a double.
3336 inline ustring16 operator+(const ustring16& left, const double right)
3337 {
3338         ustring16 ret(left);
3339         ret += core::stringc(right);
3340         return ret;
3341 }
3342
3343
3344 //! Appends a ustring16 and a double.
3345 inline ustring16 operator+(const double left, const ustring16& right)
3346 {
3347         ustring16 ret((core::stringc(left)));
3348         ret += right;
3349         return ret;
3350 }
3351
3352
3353 //! Appends two ustring16s.
3354 inline ustring16&& operator+(const ustring16& left, ustring16&& right)
3355 {
3356         right.insert(left, 0);
3357         return std::move(right);
3358 }
3359
3360
3361 //! Appends two ustring16s.
3362 inline ustring16&& operator+(ustring16&& left, const ustring16& right)
3363 {
3364         left.append(right);
3365         return std::move(left);
3366 }
3367
3368
3369 //! Appends two ustring16s.
3370 inline ustring16&& operator+(ustring16&& left, ustring16&& right)
3371 {
3372         if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3373                 (right.capacity() - right.size_raw() < left.size_raw()))
3374         {
3375                 left.append(right);
3376                 return std::move(left);
3377         }
3378         else
3379         {
3380                 right.insert(left, 0);
3381                 return std::move(right);
3382         }
3383 }
3384
3385
3386 //! Appends a ustring16 and a null-terminated unicode string.
3387 template <class B>
3388 inline ustring16&& operator+(ustring16&& left, const B* const right)
3389 {
3390         left.append(right);
3391         return std::move(left);
3392 }
3393
3394
3395 //! Appends a ustring16 and a null-terminated unicode string.
3396 template <class B>
3397 inline ustring16&& operator+(const B* const left, ustring16&& right)
3398 {
3399         right.insert(left, 0);
3400         return std::move(right);
3401 }
3402
3403
3404 //! Appends a ustring16 and an Irrlicht string.
3405 template <typename B>
3406 inline ustring16&& operator+(const string<B>& left, ustring16&& right)
3407 {
3408         right.insert(left, 0);
3409         return std::move(right);
3410 }
3411
3412
3413 //! Appends a ustring16 and an Irrlicht string.
3414 template <typename B>
3415 inline ustring16&& operator+(ustring16&& left, const string<B>& right)
3416 {
3417         left.append(right);
3418         return std::move(left);
3419 }
3420
3421
3422 //! Appends a ustring16 and a std::basic_string.
3423 template <typename B, typename A, typename BAlloc>
3424 inline ustring16&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16&& right)
3425 {
3426         right.insert(core::ustring16(left), 0);
3427         return std::move(right);
3428 }
3429
3430
3431 //! Appends a ustring16 and a std::basic_string.
3432 template <typename B, typename A, typename BAlloc>
3433 inline ustring16&& operator+(ustring16&& left, const std::basic_string<B, A, BAlloc>& right)
3434 {
3435         left.append(right);
3436         return std::move(left);
3437 }
3438
3439
3440 //! Appends a ustring16 and a char.
3441 inline ustring16 operator+(ustring16&& left, const char right)
3442 {
3443         left.append((uchar32_t)right);
3444         return std::move(left);
3445 }
3446
3447
3448 //! Appends a ustring16 and a char.
3449 inline ustring16 operator+(const char left, ustring16&& right)
3450 {
3451         right.insert((uchar32_t)left, 0);
3452         return std::move(right);
3453 }
3454
3455
3456 //! Appends a ustring16 and a uchar32_t.
3457 inline ustring16 operator+(ustring16&& left, const uchar32_t right)
3458 {
3459         left.append(right);
3460         return std::move(left);
3461 }
3462
3463
3464 //! Appends a ustring16 and a uchar32_t.
3465 inline ustring16 operator+(const uchar32_t left, ustring16&& right)
3466 {
3467         right.insert(left, 0);
3468         return std::move(right);
3469 }
3470
3471
3472 //! Appends a ustring16 and a short.
3473 inline ustring16 operator+(ustring16&& left, const short right)
3474 {
3475         left.append(core::stringc(right));
3476         return std::move(left);
3477 }
3478
3479
3480 //! Appends a ustring16 and a short.
3481 inline ustring16 operator+(const short left, ustring16&& right)
3482 {
3483         right.insert(core::stringc(left), 0);
3484         return std::move(right);
3485 }
3486
3487
3488 //! Appends a ustring16 and an unsigned short.
3489 inline ustring16 operator+(ustring16&& left, const unsigned short right)
3490 {
3491         left.append(core::stringc(right));
3492         return std::move(left);
3493 }
3494
3495
3496 //! Appends a ustring16 and an unsigned short.
3497 inline ustring16 operator+(const unsigned short left, ustring16&& right)
3498 {
3499         right.insert(core::stringc(left), 0);
3500         return std::move(right);
3501 }
3502
3503
3504 //! Appends a ustring16 and an int.
3505 inline ustring16 operator+(ustring16&& left, const int right)
3506 {
3507         left.append(core::stringc(right));
3508         return std::move(left);
3509 }
3510
3511
3512 //! Appends a ustring16 and an int.
3513 inline ustring16 operator+(const int left, ustring16&& right)
3514 {
3515         right.insert(core::stringc(left), 0);
3516         return std::move(right);
3517 }
3518
3519
3520 //! Appends a ustring16 and an unsigned int.
3521 inline ustring16 operator+(ustring16&& left, const unsigned int right)
3522 {
3523         left.append(core::stringc(right));
3524         return std::move(left);
3525 }
3526
3527
3528 //! Appends a ustring16 and an unsigned int.
3529 inline ustring16 operator+(const unsigned int left, ustring16&& right)
3530 {
3531         right.insert(core::stringc(left), 0);
3532         return std::move(right);
3533 }
3534
3535
3536 //! Appends a ustring16 and a long.
3537 inline ustring16 operator+(ustring16&& left, const long right)
3538 {
3539         left.append(core::stringc(right));
3540         return std::move(left);
3541 }
3542
3543
3544 //! Appends a ustring16 and a long.
3545 inline ustring16 operator+(const long left, ustring16&& right)
3546 {
3547         right.insert(core::stringc(left), 0);
3548         return std::move(right);
3549 }
3550
3551
3552 //! Appends a ustring16 and an unsigned long.
3553 inline ustring16 operator+(ustring16&& left, const unsigned long right)
3554 {
3555         left.append(core::stringc(right));
3556         return std::move(left);
3557 }
3558
3559
3560 //! Appends a ustring16 and an unsigned long.
3561 inline ustring16 operator+(const unsigned long left, ustring16&& right)
3562 {
3563         right.insert(core::stringc(left), 0);
3564         return std::move(right);
3565 }
3566
3567
3568 //! Appends a ustring16 and a float.
3569 inline ustring16 operator+(ustring16&& left, const float right)
3570 {
3571         left.append(core::stringc(right));
3572         return std::move(left);
3573 }
3574
3575
3576 //! Appends a ustring16 and a float.
3577 inline ustring16 operator+(const float left, ustring16&& right)
3578 {
3579         right.insert(core::stringc(left), 0);
3580         return std::move(right);
3581 }
3582
3583
3584 //! Appends a ustring16 and a double.
3585 inline ustring16 operator+(ustring16&& left, const double right)
3586 {
3587         left.append(core::stringc(right));
3588         return std::move(left);
3589 }
3590
3591
3592 //! Appends a ustring16 and a double.
3593 inline ustring16 operator+(const double left, ustring16&& right)
3594 {
3595         right.insert(core::stringc(left), 0);
3596         return std::move(right);
3597 }
3598 #endif
3599
3600
3601 //! Writes a ustring16 to an ostream.
3602 inline std::ostream& operator<<(std::ostream& out, const ustring16& in)
3603 {
3604         out << in.toUTF8_s().c_str();
3605         return out;
3606 }
3607
3608 //! Writes a ustring16 to a wostream.
3609 inline std::wostream& operator<<(std::wostream& out, const ustring16& in)
3610 {
3611         out << in.toWCHAR_s().c_str();
3612         return out;
3613 }
3614
3615 } // end namespace core
3616 } // end namespace irr