src/cguittfont/irrUString.h

   1 /*
   2    Basic Unicode string class for Irrlicht.
   3    Copyright (c) 2009-2011 John Norman
   4
   5    This software is provided 'as-is', without any express or implied
   6    warranty. In no event will the authors be held liable for any
   7    damages arising from the use of this software.
   8
   9    Permission is granted to anyone to use this software for any
  10    purpose, including commercial applications, and to alter it and
  11    redistribute it freely, subject to the following restrictions:
  12
  13    1. The origin of this software must not be misrepresented; you
  14       must not claim that you wrote the original software. If you use
  15       this software in a product, an acknowledgment in the product
  16       documentation would be appreciated but is not required.
  17
  18    2. Altered source versions must be plainly marked as such, and
  19       must not be misrepresented as being the original software.
  20
  21    3. This notice may not be removed or altered from any source
  22       distribution.
  23
  24    The original version of this class can be located at:
  25    http://irrlicht.suckerfreegames.com/
  26
  27    John Norman
  28    john@suckerfreegames.com
  29 */
  30
  31 #ifndef __IRR_USTRING_H_INCLUDED__
  32 #define __IRR_USTRING_H_INCLUDED__
  33
  34 #if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
  35 #       define USTRING_CPP0X
  36 #       if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
  37 #               define USTRING_CPP0X_NEWLITERALS
  38 #       endif
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <string.h>
  43 #include <stdlib.h>
  44 #ifdef _WIN32
  45 #define __BYTE_ORDER 0
  46 #define __LITTLE_ENDIAN 0
  47 #define __BIG_ENDIAN 1
  48 #elif __MACH__
  49 #include <machine/endian.h>
  50 #else
  51 #include <endian.h>
  52 #endif
  53
  54 #ifdef USTRING_CPP0X
  55 #       include <utility>
  56 #endif
  57
  58 #ifndef USTRING_NO_STL
  59 #       include <string>
  60 #       include <iterator>
  61 #       include <ostream>
  62 #endif
  63
  64 #include "irrTypes.h"
  65 #include "irrAllocator.h"
  66 #include "irrArray.h"
  67 #include "irrMath.h"
  68 #include "irrString.h"
  69 #include "path.h"
  70
  71 //! UTF-16 surrogate start values.
  72 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
  73 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
  74
//! Tests whether a UTF-16 code unit is a surrogate (either half), a high surrogate, or a low surrogate.
  76 #define UTF16_IS_SURROGATE(c)           (((c) & 0xF800) == 0xD800)
  77 #define UTF16_IS_SURROGATE_HI(c)        (((c) & 0xFC00) == 0xD800)
  78 #define UTF16_IS_SURROGATE_LO(c)        (((c) & 0xFC00) == 0xDC00)
  79
  80
  81 namespace irr
  82 {
  83
  84         // Define our character types.
  85 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
  86         typedef char32_t uchar32_t;
  87         typedef char16_t uchar16_t;
  88         typedef char uchar8_t;
  89 #else
  90         typedef u32 uchar32_t;
  91         typedef u16 uchar16_t;
  92         typedef u8 uchar8_t;
  93 #endif
  94
  95 namespace core
  96 {
  97
  98 namespace unicode
  99 {
 100
 101 //! The unicode replacement character.  Used to replace invalid characters.
 102 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
 103
 104 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
 105 //! \param high The high value of the pair.
 106 //! \param low The low value of the pair.
 107 //! \return The UTF-32 character expressed by the surrogate pair.
 108 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
 109 {
 110         // Convert the surrogate pair into a single UTF-32 character.
 111         uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
 112         uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
 113         return (wu << 16) | x;
 114 }
 115
 116 //! Swaps the endianness of a 16-bit value.
 117 //! \return The new value.
 118 inline uchar16_t swapEndian16(const uchar16_t& c)
 119 {
 120         return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
 121 }
 122
 123 //! Swaps the endianness of a 32-bit value.
 124 //! \return The new value.
 125 inline uchar32_t swapEndian32(const uchar32_t& c)
 126 {
 127         return  ((c >> 24) & 0x000000FF) |
 128                         ((c >> 8)  & 0x0000FF00) |
 129                         ((c << 8)  & 0x00FF0000) |
 130                         ((c << 24) & 0xFF000000);
 131 }
 132
 133 //! The Unicode byte order mark.
 134 const u16 BOM = 0xFEFF;
 135
 136 //! The size of the Unicode byte order mark in terms of the Unicode character size.
 137 const u8 BOM_UTF8_LEN = 3;
 138 const u8 BOM_UTF16_LEN = 1;
 139 const u8 BOM_UTF32_LEN = 1;
 140
 141 //! Unicode byte order marks for file operations.
 142 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
 143 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
 144 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
 145 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
 146 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
 147
 148 //! The size in bytes of the Unicode byte marks for file operations.
 149 const u8 BOM_ENCODE_UTF8_LEN = 3;
 150 const u8 BOM_ENCODE_UTF16_LEN = 2;
 151 const u8 BOM_ENCODE_UTF32_LEN = 4;
 152
 153 //! Unicode encoding type.
 154 enum EUTF_ENCODE
 155 {
 156         EUTFE_NONE              = 0,
 157         EUTFE_UTF8,
 158         EUTFE_UTF16,
 159         EUTFE_UTF16_LE,
 160         EUTFE_UTF16_BE,
 161         EUTFE_UTF32,
 162         EUTFE_UTF32_LE,
 163         EUTFE_UTF32_BE
 164 };
 165
 166 //! Unicode endianness.
 167 enum EUTF_ENDIAN
 168 {
 169         EUTFEE_NATIVE   = 0,
 170         EUTFEE_LITTLE,
 171         EUTFEE_BIG
 172 };
 173
 174 //! Returns the specified unicode byte order mark in a byte array.
 175 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
 176 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
 177                 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
 178 //! \return An array that contains a byte order mark.
 179 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
 180 {
 181 #define COPY_ARRAY(source, size) \
 182         memcpy(ret.pointer(), source, size); \
 183         ret.set_used(size)
 184
 185         core::array<u8> ret(4);
 186         switch (mode)
 187         {
 188                 case EUTFE_UTF8:
 189                         COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
 190                         break;
 191                 case EUTFE_UTF16:
 192                         #ifdef __BIG_ENDIAN__
 193                                 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 194                         #else
 195                                 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 196                         #endif
 197                         break;
 198                 case EUTFE_UTF16_BE:
 199                         COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 200                         break;
 201                 case EUTFE_UTF16_LE:
 202                         COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 203                         break;
 204                 case EUTFE_UTF32:
 205                         #ifdef __BIG_ENDIAN__
 206                                 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 207                         #else
 208                                 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 209                         #endif
 210                         break;
 211                 case EUTFE_UTF32_BE:
 212                         COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 213                         break;
 214                 case EUTFE_UTF32_LE:
 215                         COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 216                         break;
 217                 case EUTFE_NONE:
 218                         // TODO sapier: fixed warning only,
 219                         // don't know if something needs to be done here
 220                         break;
 221         }
 222         return ret;
 223
 224 #undef COPY_ARRAY
 225 }
 226
 227 //! Detects if the given data stream starts with a unicode BOM.
 228 //! \param data The data stream to check.
 229 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
 230 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
 231 {
 232         if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
 233         if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
 234         if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
 235         if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
 236         if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
 237         return EUTFE_NONE;
 238 }
 239
 240 } // end namespace unicode
 241
 242
 243 //! UTF-16 string class.
 244 template <typename TAlloc = irrAllocator<uchar16_t> >
 245 class ustring16
 246 {
 247 public:
 248
 249         ///------------------///
 250         /// iterator classes ///
 251         ///------------------///
 252
 253         //! Access an element in a unicode string, allowing one to change it.
 254         class _ustring16_iterator_access
 255         {
 256                 public:
 257                         _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
 258
 259                         //! Allow the class to be interpreted as a single UTF-32 character.
 260                         operator uchar32_t() const
 261                         {
 262                                 return _get();
 263                         }
 264
 265                         //! Allow one to change the character in the unicode string.
 266                         //! \param c The new character to use.
 267                         //! \return Myself.
 268                         _ustring16_iterator_access& operator=(const uchar32_t c)
 269                         {
 270                                 _set(c);
 271                                 return *this;
 272                         }
 273
 274                         //! Increments the value by 1.
 275                         //! \return Myself.
 276                         _ustring16_iterator_access& operator++()
 277                         {
 278                                 _set(_get() + 1);
 279                                 return *this;
 280                         }
 281
 282                         //! Increments the value by 1, returning the old value.
 283                         //! \return A unicode character.
 284                         uchar32_t operator++(int)
 285                         {
 286                                 uchar32_t old = _get();
 287                                 _set(old + 1);
 288                                 return old;
 289                         }
 290
 291                         //! Decrements the value by 1.
 292                         //! \return Myself.
 293                         _ustring16_iterator_access& operator--()
 294                         {
 295                                 _set(_get() - 1);
 296                                 return *this;
 297                         }
 298
 299                         //! Decrements the value by 1, returning the old value.
 300                         //! \return A unicode character.
 301                         uchar32_t operator--(int)
 302                         {
 303                                 uchar32_t old = _get();
 304                                 _set(old - 1);
 305                                 return old;
 306                         }
 307
 308                         //! Adds to the value by a specified amount.
 309                         //! \param val The amount to add to this character.
 310                         //! \return Myself.
 311                         _ustring16_iterator_access& operator+=(int val)
 312                         {
 313                                 _set(_get() + val);
 314                                 return *this;
 315                         }
 316
 317                         //! Subtracts from the value by a specified amount.
 318                         //! \param val The amount to subtract from this character.
 319                         //! \return Myself.
 320                         _ustring16_iterator_access& operator-=(int val)
 321                         {
 322                                 _set(_get() - val);
 323                                 return *this;
 324                         }
 325
 326                         //! Multiples the value by a specified amount.
 327                         //! \param val The amount to multiply this character by.
 328                         //! \return Myself.
 329                         _ustring16_iterator_access& operator*=(int val)
 330                         {
 331                                 _set(_get() * val);
 332                                 return *this;
 333                         }
 334
 335                         //! Divides the value by a specified amount.
 336                         //! \param val The amount to divide this character by.
 337                         //! \return Myself.
 338                         _ustring16_iterator_access& operator/=(int val)
 339                         {
 340                                 _set(_get() / val);
 341                                 return *this;
 342                         }
 343
 344                         //! Modulos the value by a specified amount.
 345                         //! \param val The amount to modulo this character by.
 346                         //! \return Myself.
 347                         _ustring16_iterator_access& operator%=(int val)
 348                         {
 349                                 _set(_get() % val);
 350                                 return *this;
 351                         }
 352
 353                         //! Adds to the value by a specified amount.
 354                         //! \param val The amount to add to this character.
 355                         //! \return A unicode character.
 356                         uchar32_t operator+(int val) const
 357                         {
 358                                 return _get() + val;
 359                         }
 360
 361                         //! Subtracts from the value by a specified amount.
 362                         //! \param val The amount to subtract from this character.
 363                         //! \return A unicode character.
 364                         uchar32_t operator-(int val) const
 365                         {
 366                                 return _get() - val;
 367                         }
 368
 369                         //! Multiplies the value by a specified amount.
 370                         //! \param val The amount to multiply this character by.
 371                         //! \return A unicode character.
 372                         uchar32_t operator*(int val) const
 373                         {
 374                                 return _get() * val;
 375                         }
 376
 377                         //! Divides the value by a specified amount.
 378                         //! \param val The amount to divide this character by.
 379                         //! \return A unicode character.
 380                         uchar32_t operator/(int val) const
 381                         {
 382                                 return _get() / val;
 383                         }
 384
 385                         //! Modulos the value by a specified amount.
 386                         //! \param val The amount to modulo this character by.
 387                         //! \return A unicode character.
 388                         uchar32_t operator%(int val) const
 389                         {
 390                                 return _get() % val;
 391                         }
 392
 393                 private:
 394                         //! Gets a uchar32_t from our current position.
 395                         uchar32_t _get() const
 396                         {
 397                                 const uchar16_t* a = ref->c_str();
 398                                 if (!UTF16_IS_SURROGATE(a[pos]))
 399                                         return static_cast<uchar32_t>(a[pos]);
 400                                 else
 401                                 {
 402                                         if (pos + 1 >= ref->size_raw())
 403                                                 return 0;
 404
 405                                         return unicode::toUTF32(a[pos], a[pos + 1]);
 406                                 }
 407                         }
 408
 409                         //! Sets a uchar32_t at our current position.
 410                         void _set(uchar32_t c)
 411                         {
 412                                 ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
 413                                 const uchar16_t* a = ref2->c_str();
 414                                 if (c > 0xFFFF)
 415                                 {
 416                                         // c will be multibyte, so split it up into the high and low surrogate pairs.
 417                                         uchar16_t x = static_cast<uchar16_t>(c);
 418                                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
 419                                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
 420
 421                                         // If the previous position was a surrogate pair, just replace them.  Else, insert the low pair.
 422                                         if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
 423                                                 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
 424                                         else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
 425
 426                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 427                                 }
 428                                 else
 429                                 {
 430                                         // c will be a single byte.
 431                                         uchar16_t vh = static_cast<uchar16_t>(c);
 432
 433                                         // If the previous position was a surrogate pair, remove the extra byte.
 434                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 435                                                 ref2->erase_raw(static_cast<u32>(pos) + 1);
 436
 437                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 438                                 }
 439                         }
 440
 441                         const ustring16<TAlloc>* ref;
 442                         u32 pos;
 443         };
 444         typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
 445
 446
 447         //! Iterator to iterate through a UTF-16 string.
 448 #ifndef USTRING_NO_STL
 449         class _ustring16_const_iterator : public std::iterator<
 450                 std::bidirectional_iterator_tag,        // iterator_category
 451                 access,                                                         // value_type
 452                 ptrdiff_t,                                                      // difference_type
 453                 const access,                                           // pointer
 454                 const access                                            // reference
 455         >
 456 #else
 457         class _ustring16_const_iterator
 458 #endif
 459         {
 460                 public:
 461                         typedef _ustring16_const_iterator _Iter;
 462                         typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
 463                         typedef const access const_pointer;
 464                         typedef const access const_reference;
 465
 466 #ifndef USTRING_NO_STL
 467                         typedef typename _Base::value_type value_type;
 468                         typedef typename _Base::difference_type difference_type;
 469                         typedef typename _Base::difference_type distance_type;
 470                         typedef typename _Base::pointer pointer;
 471                         typedef const_reference reference;
 472 #else
 473                         typedef access value_type;
 474                         typedef u32 difference_type;
 475                         typedef u32 distance_type;
 476                         typedef const_pointer pointer;
 477                         typedef const_reference reference;
 478 #endif
 479
 480                         //! Constructors.
 481                         _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
 482                         _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
 483                         _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
 484                         {
 485                                 if (ref->size_raw() == 0 || p == 0)
 486                                         return;
 487
 488                                 // Go to the appropriate position.
 489                                 u32 i = p;
 490                                 u32 sr = ref->size_raw();
 491                                 const uchar16_t* a = ref->c_str();
 492                                 while (i != 0 && pos < sr)
 493                                 {
 494                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 495                                                 pos += 2;
 496                                         else ++pos;
 497                                         --i;
 498                                 }
 499                         }
 500
 501                         //! Test for equalness.
 502                         bool operator==(const _Iter& iter) const
 503                         {
 504                                 if (ref == iter.ref && pos == iter.pos)
 505                                         return true;
 506                                 return false;
 507                         }
 508
 509                         //! Test for unequalness.
 510                         bool operator!=(const _Iter& iter) const
 511                         {
 512                                 if (ref != iter.ref || pos != iter.pos)
 513                                         return true;
 514                                 return false;
 515                         }
 516
 517                         //! Switch to the next full character in the string.
 518                         _Iter& operator++()
 519                         {       // ++iterator
 520                                 if (pos == ref->size_raw()) return *this;
 521                                 const uchar16_t* a = ref->c_str();
 522                                 if (UTF16_IS_SURROGATE_HI(a[pos]))
 523                                         pos += 2;                       // TODO: check for valid low surrogate?
 524                                 else ++pos;
 525                                 if (pos > ref->size_raw()) pos = ref->size_raw();
 526                                 return *this;
 527                         }
 528
 529                         //! Switch to the next full character in the string, returning the previous position.
 530                         _Iter operator++(int)
 531                         {       // iterator++
 532                                 _Iter _tmp(*this);
 533                                 ++*this;
 534                                 return _tmp;
 535                         }
 536
 537                         //! Switch to the previous full character in the string.
 538                         _Iter& operator--()
 539                         {       // --iterator
 540                                 if (pos == 0) return *this;
 541                                 const uchar16_t* a = ref->c_str();
 542                                 --pos;
 543                                 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0)  // low surrogate, go back one more.
 544                                         --pos;
 545                                 return *this;
 546                         }
 547
 548                         //! Switch to the previous full character in the string, returning the previous position.
 549                         _Iter operator--(int)
 550                         {       // iterator--
 551                                 _Iter _tmp(*this);
 552                                 --*this;
 553                                 return _tmp;
 554                         }
 555
 556                         //! Advance a specified number of full characters in the string.
 557                         //! \return Myself.
 558                         _Iter& operator+=(const difference_type v)
 559                         {
 560                                 if (v == 0) return *this;
 561                                 if (v < 0) return operator-=(v * -1);
 562
 563                                 if (pos >= ref->size_raw())
 564                                         return *this;
 565
 566                                 // Go to the appropriate position.
 567                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 568                                 u32 i = (u32)v;
 569                                 u32 sr = ref->size_raw();
 570                                 const uchar16_t* a = ref->c_str();
 571                                 while (i != 0 && pos < sr)
 572                                 {
 573                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 574                                                 pos += 2;
 575                                         else ++pos;
 576                                         --i;
 577                                 }
 578                                 if (pos > sr)
 579                                         pos = sr;
 580
 581                                 return *this;
 582                         }
 583
 584                         //! Go back a specified number of full characters in the string.
 585                         //! \return Myself.
 586                         _Iter& operator-=(const difference_type v)
 587                         {
 588                                 if (v == 0) return *this;
 589                                 if (v > 0) return operator+=(v * -1);
 590
 591                                 if (pos == 0)
 592                                         return *this;
 593
 594                                 // Go to the appropriate position.
 595                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 596                                 u32 i = (u32)v;
 597                                 const uchar16_t* a = ref->c_str();
 598                                 while (i != 0 && pos != 0)
 599                                 {
 600                                         --pos;
 601                                         if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
 602                                                 --pos;
 603                                         --i;
 604                                 }
 605
 606                                 return *this;
 607                         }
 608
 609                         //! Return a new iterator that is a variable number of full characters forward from the current position.
 610                         _Iter operator+(const difference_type v) const
 611                         {
 612                                 _Iter ret(*this);
 613                                 ret += v;
 614                                 return ret;
 615                         }
 616
 617                         //! Return a new iterator that is a variable number of full characters backward from the current position.
 618                         _Iter operator-(const difference_type v) const
 619                         {
 620                                 _Iter ret(*this);
 621                                 ret -= v;
 622                                 return ret;
 623                         }
 624
 625                         //! Returns the distance between two iterators.
 626                         difference_type operator-(const _Iter& iter) const
 627                         {
 628                                 // Make sure we reference the same object!
 629                                 if (ref != iter.ref)
 630                                         return difference_type();
 631
 632                                 _Iter i = iter;
 633                                 difference_type ret;
 634
 635                                 // Walk up.
 636                                 if (pos > i.pos)
 637                                 {
 638                                         while (pos > i.pos)
 639                                         {
 640                                                 ++i;
 641                                                 ++ret;
 642                                         }
 643                                         return ret;
 644                                 }
 645
 646                                 // Walk down.
 647                                 while (pos < i.pos)
 648                                 {
 649                                         --i;
 650                                         --ret;
 651                                 }
 652                                 return ret;
 653                         }
 654
 655                         //! Accesses the full character at the iterator's position.
 656                         const_reference operator*() const
 657                         {
 658                                 if (pos >= ref->size_raw())
 659                                 {
 660                                         const uchar16_t* a = ref->c_str();
 661                                         u32 p = ref->size_raw();
 662                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 663                                                 --p;
 664                                         reference ret(ref, p);
 665                                         return ret;
 666                                 }
 667                                 const_reference ret(ref, pos);
 668                                 return ret;
 669                         }
 670
 671                         //! Accesses the full character at the iterator's position.
 672                         reference operator*()
 673                         {
 674                                 if (pos >= ref->size_raw())
 675                                 {
 676                                         const uchar16_t* a = ref->c_str();
 677                                         u32 p = ref->size_raw();
 678                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 679                                                 --p;
 680                                         reference ret(ref, p);
 681                                         return ret;
 682                                 }
 683                                 reference ret(ref, pos);
 684                                 return ret;
 685                         }
 686
 687                         //! Accesses the full character at the iterator's position.
 688                         const_pointer operator->() const
 689                         {
 690                                 return operator*();
 691                         }
 692
 693                         //! Accesses the full character at the iterator's position.
 694                         pointer operator->()
 695                         {
 696                                 return operator*();
 697                         }
 698
 699                         //! Is the iterator at the start of the string?
 700                         bool atStart() const
 701                         {
 702                                 return pos == 0;
 703                         }
 704
 705                         //! Is the iterator at the end of the string?
 706                         bool atEnd() const
 707                         {
 708                                 const uchar16_t* a = ref->c_str();
 709                                 if (UTF16_IS_SURROGATE(a[pos]))
 710                                         return (pos + 1) >= ref->size_raw();
 711                                 else return pos >= ref->size_raw();
 712                         }
 713
 714                         //! Moves the iterator to the start of the string.
 715                         void toStart()
 716                         {
 717                                 pos = 0;
 718                         }
 719
 720                         //! Moves the iterator to the end of the string.
 721                         void toEnd()
 722                         {
 723                                 pos = ref->size_raw();
 724                         }
 725
 726                         //! Returns the iterator's position.
 727                         //! \return The iterator's position.
 728                         u32 getPos() const
 729                         {
 730                                 return pos;
 731                         }
 732
 733                 protected:
 734                         const ustring16<TAlloc>* ref;
 735                         u32 pos;
 736         };
 737
 738         //! Iterator to iterate through a UTF-16 string.
 739         class _ustring16_iterator : public _ustring16_const_iterator
 740         {
 741                 public:
 742                         typedef _ustring16_iterator _Iter;
 743                         typedef _ustring16_const_iterator _Base;
 744                         typedef typename _Base::const_pointer const_pointer;
 745                         typedef typename _Base::const_reference const_reference;
 746
 747
 748                         typedef typename _Base::value_type value_type;
 749                         typedef typename _Base::difference_type difference_type;
 750                         typedef typename _Base::distance_type distance_type;
 751                         typedef access pointer;
 752                         typedef access reference;
 753
 754                         using _Base::pos;
 755                         using _Base::ref;
 756
 757                         //! Constructors.
 758                         _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
 759                         _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
 760                         _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
 761
 762                         //! Accesses the full character at the iterator's position.
 763                         reference operator*() const
 764                         {
 765                                 if (pos >= ref->size_raw())
 766                                 {
 767                                         const uchar16_t* a = ref->c_str();
 768                                         u32 p = ref->size_raw();
 769                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 770                                                 --p;
 771                                         reference ret(ref, p);
 772                                         return ret;
 773                                 }
 774                                 reference ret(ref, pos);
 775                                 return ret;
 776                         }
 777
 778                         //! Accesses the full character at the iterator's position.
 779                         reference operator*()
 780                         {
 781                                 if (pos >= ref->size_raw())
 782                                 {
 783                                         const uchar16_t* a = ref->c_str();
 784                                         u32 p = ref->size_raw();
 785                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 786                                                 --p;
 787                                         reference ret(ref, p);
 788                                         return ret;
 789                                 }
 790                                 reference ret(ref, pos);
 791                                 return ret;
 792                         }
 793
 794                         //! Accesses the full character at the iterator's position.
 795                         pointer operator->() const
 796                         {
 797                                 return operator*();
 798                         }
 799
 800                         //! Accesses the full character at the iterator's position.
 801                         pointer operator->()
 802                         {
 803                                 return operator*();
 804                         }
 805         };
 806
 807         typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
 808         typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
 809
 810         ///----------------------///
 811         /// end iterator classes ///
 812         ///----------------------///
 813
 814         //! Default constructor
 815         ustring16()
 816         : array(0), allocated(1), used(0)
 817         {
 818 #if __BYTE_ORDER == __BIG_ENDIAN
 819                 encoding = unicode::EUTFE_UTF16_BE;
 820 #else
 821                 encoding = unicode::EUTFE_UTF16_LE;
 822 #endif
 823                 array = allocator.allocate(1); // new u16[1];
 824                 array[0] = 0x0;
 825         }
 826
 827
 828         //! Constructor
 829         ustring16(const ustring16<TAlloc>& other)
 830         : array(0), allocated(0), used(0)
 831         {
 832 #if __BYTE_ORDER == __BIG_ENDIAN
 833                 encoding = unicode::EUTFE_UTF16_BE;
 834 #else
 835                 encoding = unicode::EUTFE_UTF16_LE;
 836 #endif
 837                 *this = other;
 838         }
 839
 840
 841         //! Constructor from other string types
 842         template <class B, class A>
 843         ustring16(const string<B, A>& other)
 844         : array(0), allocated(0), used(0)
 845         {
 846 #if __BYTE_ORDER == __BIG_ENDIAN
 847                 encoding = unicode::EUTFE_UTF16_BE;
 848 #else
 849                 encoding = unicode::EUTFE_UTF16_LE;
 850 #endif
 851                 *this = other;
 852         }
 853
 854
 855 #ifndef USTRING_NO_STL
 856         //! Constructor from std::string
 857         template <class B, class A, typename Alloc>
 858         ustring16(const std::basic_string<B, A, Alloc>& other)
 859         : array(0), allocated(0), used(0)
 860         {
 861 #if __BYTE_ORDER == __BIG_ENDIAN
 862                 encoding = unicode::EUTFE_UTF16_BE;
 863 #else
 864                 encoding = unicode::EUTFE_UTF16_LE;
 865 #endif
 866                 *this = other.c_str();
 867         }
 868
 869
 870         //! Constructor from iterator.
 871         template <typename Itr>
 872         ustring16(Itr first, Itr last)
 873         : array(0), allocated(0), used(0)
 874         {
 875 #if __BYTE_ORDER == __BIG_ENDIAN
 876                 encoding = unicode::EUTFE_UTF16_BE;
 877 #else
 878                 encoding = unicode::EUTFE_UTF16_LE;
 879 #endif
 880                 reserve(std::distance(first, last));
 881                 array[used] = 0;
 882
 883                 for (; first != last; ++first)
 884                         append((uchar32_t)*first);
 885         }
 886 #endif
 887
 888
 889 #ifndef USTRING_CPP0X_NEWLITERALS
 890         //! Constructor for copying a character string from a pointer.
 891         ustring16(const char* const c)
 892         : array(0), allocated(0), used(0)
 893         {
 894 #if __BYTE_ORDER == __BIG_ENDIAN
 895                 encoding = unicode::EUTFE_UTF16_BE;
 896 #else
 897                 encoding = unicode::EUTFE_UTF16_LE;
 898 #endif
 899
 900                 loadDataStream(c, strlen(c));
 901                 //append((uchar8_t*)c);
 902         }
 903
 904
 905         //! Constructor for copying a character string from a pointer with a given length.
 906         ustring16(const char* const c, u32 length)
 907         : array(0), allocated(0), used(0)
 908         {
 909 #if __BYTE_ORDER == __BIG_ENDIAN
 910                 encoding = unicode::EUTFE_UTF16_BE;
 911 #else
 912                 encoding = unicode::EUTFE_UTF16_LE;
 913 #endif
 914
 915                 loadDataStream(c, length);
 916         }
 917 #endif
 918
 919
 920         //! Constructor for copying a UTF-8 string from a pointer.
 921         ustring16(const uchar8_t* const c)
 922         : array(0), allocated(0), used(0)
 923         {
 924 #if __BYTE_ORDER == __BIG_ENDIAN
 925                 encoding = unicode::EUTFE_UTF16_BE;
 926 #else
 927                 encoding = unicode::EUTFE_UTF16_LE;
 928 #endif
 929
 930                 append(c);
 931         }
 932
 933
 934         //! Constructor for copying a UTF-8 string from a single char.
 935         ustring16(const char c)
 936         : array(0), allocated(0), used(0)
 937         {
 938 #if __BYTE_ORDER == __BIG_ENDIAN
 939                 encoding = unicode::EUTFE_UTF16_BE;
 940 #else
 941                 encoding = unicode::EUTFE_UTF16_LE;
 942 #endif
 943
 944                 append((uchar32_t)c);
 945         }
 946
 947
 948         //! Constructor for copying a UTF-8 string from a pointer with a given length.
 949         ustring16(const uchar8_t* const c, u32 length)
 950         : array(0), allocated(0), used(0)
 951         {
 952 #if __BYTE_ORDER == __BIG_ENDIAN
 953                 encoding = unicode::EUTFE_UTF16_BE;
 954 #else
 955                 encoding = unicode::EUTFE_UTF16_LE;
 956 #endif
 957
 958                 append(c, length);
 959         }
 960
 961
 962         //! Constructor for copying a UTF-16 string from a pointer.
 963         ustring16(const uchar16_t* const c)
 964         : array(0), allocated(0), used(0)
 965         {
 966 #if __BYTE_ORDER == __BIG_ENDIAN
 967                 encoding = unicode::EUTFE_UTF16_BE;
 968 #else
 969                 encoding = unicode::EUTFE_UTF16_LE;
 970 #endif
 971
 972                 append(c);
 973         }
 974
 975
 976         //! Constructor for copying a UTF-16 string from a pointer with a given length
 977         ustring16(const uchar16_t* const c, u32 length)
 978         : array(0), allocated(0), used(0)
 979         {
 980 #if __BYTE_ORDER == __BIG_ENDIAN
 981                 encoding = unicode::EUTFE_UTF16_BE;
 982 #else
 983                 encoding = unicode::EUTFE_UTF16_LE;
 984 #endif
 985
 986                 append(c, length);
 987         }
 988
 989
 990         //! Constructor for copying a UTF-32 string from a pointer.
 991         ustring16(const uchar32_t* const c)
 992         : array(0), allocated(0), used(0)
 993         {
 994 #if __BYTE_ORDER == __BIG_ENDIAN
 995                 encoding = unicode::EUTFE_UTF16_BE;
 996 #else
 997                 encoding = unicode::EUTFE_UTF16_LE;
 998 #endif
 999
1000                 append(c);
1001         }
1002
1003
1004         //! Constructor for copying a UTF-32 from a pointer with a given length.
1005         ustring16(const uchar32_t* const c, u32 length)
1006         : array(0), allocated(0), used(0)
1007         {
1008 #if __BYTE_ORDER == __BIG_ENDIAN
1009                 encoding = unicode::EUTFE_UTF16_BE;
1010 #else
1011                 encoding = unicode::EUTFE_UTF16_LE;
1012 #endif
1013
1014                 append(c, length);
1015         }
1016
1017
1018         //! Constructor for copying a wchar_t string from a pointer.
1019         ustring16(const wchar_t* const c)
1020         : array(0), allocated(0), used(0)
1021         {
1022 #if __BYTE_ORDER == __BIG_ENDIAN
1023                 encoding = unicode::EUTFE_UTF16_BE;
1024 #else
1025                 encoding = unicode::EUTFE_UTF16_LE;
1026 #endif
1027
1028                 if (sizeof(wchar_t) == 4)
1029                         append(reinterpret_cast<const uchar32_t* const>(c));
1030                 else if (sizeof(wchar_t) == 2)
1031                         append(reinterpret_cast<const uchar16_t* const>(c));
1032                 else if (sizeof(wchar_t) == 1)
1033                         append(reinterpret_cast<const uchar8_t* const>(c));
1034         }
1035
1036
1037         //! Constructor for copying a wchar_t string from a pointer with a given length.
1038         ustring16(const wchar_t* const c, u32 length)
1039         : array(0), allocated(0), used(0)
1040         {
1041 #if __BYTE_ORDER == __BIG_ENDIAN
1042                 encoding = unicode::EUTFE_UTF16_BE;
1043 #else
1044                 encoding = unicode::EUTFE_UTF16_LE;
1045 #endif
1046
1047                 if (sizeof(wchar_t) == 4)
1048                         append(reinterpret_cast<const uchar32_t* const>(c), length);
1049                 else if (sizeof(wchar_t) == 2)
1050                         append(reinterpret_cast<const uchar16_t* const>(c), length);
1051                 else if (sizeof(wchar_t) == 1)
1052                         append(reinterpret_cast<const uchar8_t* const>(c), length);
1053         }
1054
1055
1056 #ifdef USTRING_CPP0X
1057         //! Constructor for moving a ustring16
1058         ustring16(ustring16<TAlloc>&& other)
1059         : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
1060         {
1061                 //std::cout << "MOVE constructor" << std::endl;
1062                 other.array = 0;
1063                 other.allocated = 0;
1064                 other.used = 0;
1065         }
1066 #endif
1067
1068
1069         //! Destructor
1070         ~ustring16()
1071         {
1072                 allocator.deallocate(array); // delete [] array;
1073         }
1074
1075
1076         //! Assignment operator
1077         ustring16& operator=(const ustring16<TAlloc>& other)
1078         {
1079                 if (this == &other)
1080                         return *this;
1081
1082                 used = other.size_raw();
1083                 if (used >= allocated)
1084                 {
1085                         allocator.deallocate(array); // delete [] array;
1086                         allocated = used + 1;
1087                         array = allocator.allocate(used + 1); //new u16[used];
1088                 }
1089
1090                 const uchar16_t* p = other.c_str();
1091                 for (u32 i=0; i<=used; ++i, ++p)
1092                         array[i] = *p;
1093
1094                 array[used] = 0;
1095
1096                 // Validate our new UTF-16 string.
1097                 validate();
1098
1099                 return *this;
1100         }
1101
1102
1103 #ifdef USTRING_CPP0X
1104         //! Move assignment operator
1105         ustring16& operator=(ustring16<TAlloc>&& other)
1106         {
1107                 if (this != &other)
1108                 {
1109                         //std::cout << "MOVE operator=" << std::endl;
1110                         allocator.deallocate(array);
1111
1112                         array = other.array;
1113                         allocated = other.allocated;
1114                         encoding = other.encoding;
1115                         used = other.used;
1116                         other.array = 0;
1117                         other.used = 0;
1118                 }
1119                 return *this;
1120         }
1121 #endif
1122
1123
1124         //! Assignment operator for other string types
1125         template <class B, class A>
1126         ustring16<TAlloc>& operator=(const string<B, A>& other)
1127         {
1128                 *this = other.c_str();
1129                 return *this;
1130         }
1131
1132
1133         //! Assignment operator for UTF-8 strings
1134         ustring16<TAlloc>& operator=(const uchar8_t* const c)
1135         {
1136                 if (!array)
1137                 {
1138                         array = allocator.allocate(1); //new u16[1];
1139                         allocated = 1;
1140                 }
1141                 used = 0;
1142                 array[used] = 0x0;
1143                 if (!c) return *this;
1144
1145                 //! Append our string now.
1146                 append(c);
1147                 return *this;
1148         }
1149
1150
1151         //! Assignment operator for UTF-16 strings
1152         ustring16<TAlloc>& operator=(const uchar16_t* const c)
1153         {
1154                 if (!array)
1155                 {
1156                         array = allocator.allocate(1); //new u16[1];
1157                         allocated = 1;
1158                 }
1159                 used = 0;
1160                 array[used] = 0x0;
1161                 if (!c) return *this;
1162
1163                 //! Append our string now.
1164                 append(c);
1165                 return *this;
1166         }
1167
1168
1169         //! Assignment operator for UTF-32 strings
1170         ustring16<TAlloc>& operator=(const uchar32_t* const c)
1171         {
1172                 if (!array)
1173                 {
1174                         array = allocator.allocate(1); //new u16[1];
1175                         allocated = 1;
1176                 }
1177                 used = 0;
1178                 array[used] = 0x0;
1179                 if (!c) return *this;
1180
1181                 //! Append our string now.
1182                 append(c);
1183                 return *this;
1184         }
1185
1186
1187         //! Assignment operator for wchar_t strings.
1188         /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1189                 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1190                 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
1191         ustring16<TAlloc>& operator=(const wchar_t* const c)
1192         {
1193                 if (sizeof(wchar_t) == 4)
1194                         *this = reinterpret_cast<const uchar32_t* const>(c);
1195                 else if (sizeof(wchar_t) == 2)
1196                         *this = reinterpret_cast<const uchar16_t* const>(c);
1197                 else if (sizeof(wchar_t) == 1)
1198                         *this = reinterpret_cast<const uchar8_t* const>(c);
1199
1200                 return *this;
1201         }
1202
1203
1204         //! Assignment operator for other strings.
1205         /** Note that this assumes that a correct unicode string is stored in the string. **/
1206         template <class B>
1207         ustring16<TAlloc>& operator=(const B* const c)
1208         {
1209                 if (sizeof(B) == 4)
1210                         *this = reinterpret_cast<const uchar32_t* const>(c);
1211                 else if (sizeof(B) == 2)
1212                         *this = reinterpret_cast<const uchar16_t* const>(c);
1213                 else if (sizeof(B) == 1)
1214                         *this = reinterpret_cast<const uchar8_t* const>(c);
1215
1216                 return *this;
1217         }
1218
1219
1220         //! Direct access operator
1221         access operator [](const u32 index)
1222         {
1223                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1224                 iterator iter(*this, index);
1225                 return iter.operator*();
1226         }
1227
1228
1229         //! Direct access operator
1230         const access operator [](const u32 index) const
1231         {
1232                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1233                 const_iterator iter(*this, index);
1234                 return iter.operator*();
1235         }
1236
1237
1238         //! Equality operator
1239         bool operator ==(const uchar16_t* const str) const
1240         {
1241                 if (!str)
1242                         return false;
1243
1244                 u32 i;
1245                 for(i=0; array[i] && str[i]; ++i)
1246                         if (array[i] != str[i])
1247                                 return false;
1248
1249                 return !array[i] && !str[i];
1250         }
1251
1252
1253         //! Equality operator
1254         bool operator ==(const ustring16<TAlloc>& other) const
1255         {
1256                 for(u32 i=0; array[i] && other.array[i]; ++i)
1257                         if (array[i] != other.array[i])
1258                                 return false;
1259
1260                 return used == other.used;
1261         }
1262
1263
1264         //! Is smaller comparator
1265         bool operator <(const ustring16<TAlloc>& other) const
1266         {
1267                 for(u32 i=0; array[i] && other.array[i]; ++i)
1268                 {
1269                         s32 diff = array[i] - other.array[i];
1270                         if ( diff )
1271                                 return diff < 0;
1272                 }
1273
1274                 return used < other.used;
1275         }
1276
1277
1278         //! Inequality operator
1279         bool operator !=(const uchar16_t* const str) const
1280         {
1281                 return !(*this == str);
1282         }
1283
1284
1285         //! Inequality operator
1286         bool operator !=(const ustring16<TAlloc>& other) const
1287         {
1288                 return !(*this == other);
1289         }
1290
1291
1292         //! Returns the length of a ustring16 in full characters.
1293         //! \return Length of a ustring16 in full characters.
1294         u32 size() const
1295         {
1296                 const_iterator i(*this, 0);
1297                 u32 pos = 0;
1298                 while (!i.atEnd())
1299                 {
1300                         ++i;
1301                         ++pos;
1302                 }
1303                 return pos;
1304         }
1305
1306
1307         //! Informs if the ustring is empty or not.
1308         //! \return True if the ustring is empty, false if not.
1309         bool empty() const
1310         {
1311                 return (size_raw() == 0);
1312         }
1313
1314
1315         //! Returns a pointer to the raw UTF-16 string data.
1316         //! \return pointer to C-style NUL terminated array of UTF-16 code points.
1317         const uchar16_t* c_str() const
1318         {
1319                 return array;
1320         }
1321
1322
1323         //! Compares the first n characters of this string with another.
1324         //! \param other Other string to compare to.
1325         //! \param n Number of characters to compare.
1326         //! \return True if the n first characters of both strings are equal.
1327         bool equalsn(const ustring16<TAlloc>& other, u32 n) const
1328         {
1329                 u32 i;
1330                 const uchar16_t* oa = other.c_str();
1331                 for(i=0; array[i] && oa[i] && i < n; ++i)
1332                         if (array[i] != oa[i])
1333                                 return false;
1334
1335                 // if one (or both) of the strings was smaller then they
1336                 // are only equal if they have the same length
1337                 return (i == n) || (used == other.used);
1338         }
1339
1340
1341         //! Compares the first n characters of this string with another.
1342         //! \param str Other string to compare to.
1343         //! \param n Number of characters to compare.
1344         //! \return True if the n first characters of both strings are equal.
1345         bool equalsn(const uchar16_t* const str, u32 n) const
1346         {
1347                 if (!str)
1348                         return false;
1349                 u32 i;
1350                 for(i=0; array[i] && str[i] && i < n; ++i)
1351                         if (array[i] != str[i])
1352                                 return false;
1353
1354                 // if one (or both) of the strings was smaller then they
1355                 // are only equal if they have the same length
1356                 return (i == n) || (array[i] == 0 && str[i] == 0);
1357         }
1358
1359
1360         //! Appends a character to this ustring16
1361         //! \param character The character to append.
1362         //! \return A reference to our current string.
1363         ustring16<TAlloc>& append(uchar32_t character)
1364         {
1365                 if (used + 2 >= allocated)
1366                         reallocate(used + 2);
1367
1368                 if (character > 0xFFFF)
1369                 {
1370                         used += 2;
1371
1372                         // character will be multibyte, so split it up into a surrogate pair.
1373                         uchar16_t x = static_cast<uchar16_t>(character);
1374                         uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1375                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1376                         array[used-2] = vh;
1377                         array[used-1] = vl;
1378                 }
1379                 else
1380                 {
1381                         ++used;
1382                         array[used-1] = character;
1383                 }
1384                 array[used] = 0;
1385
1386                 return *this;
1387         }
1388
1389
1390         //! Appends a UTF-8 string to this ustring16
1391         //! \param other The UTF-8 string to append.
1392         //! \param length The length of the string to append.
1393         //! \return A reference to our current string.
1394         ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
1395         {
1396                 if (!other)
1397                         return *this;
1398
1399                 // Determine if the string is long enough for a BOM.
1400                 u32 len = 0;
1401                 const uchar8_t* p = other;
1402                 do
1403                 {
1404                         ++len;
1405                 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1406
1407                 // Check for BOM.
1408                 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1409                 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1410                 {
1411                         if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1412                                 c_bom = unicode::EUTFE_UTF8;
1413                 }
1414
1415                 // If a BOM was found, don't include it in the string.
1416                 const uchar8_t* c2 = other;
1417                 if (c_bom != unicode::EUTFE_NONE)
1418                 {
1419                         c2 = other + unicode::BOM_UTF8_LEN;
1420                         length -= unicode::BOM_UTF8_LEN;
1421                 }
1422
1423                 // Calculate the size of the string to read in.
1424                 len = 0;
1425                 p = c2;
1426                 do
1427                 {
1428                         ++len;
1429                 } while(*p++ && len < length);
1430                 if (len > length)
1431                         len = length;
1432
1433                 // If we need to grow the array, do it now.
1434                 if (used + len >= allocated)
1435                         reallocate(used + (len * 2));
1436                 u32 start = used;
1437
1438                 // Convert UTF-8 to UTF-16.
1439                 u32 pos = start;
1440                 for (u32 l = 0; l<len;)
1441                 {
1442                         ++used;
1443                         if (((c2[l] >> 6) & 0x03) == 0x02)
1444                         {       // Invalid continuation byte.
1445                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1446                                 ++l;
1447                         }
1448                         else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1449                         {       // Invalid byte - overlong encoding.
1450                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1451                                 ++l;
1452                         }
1453                         else if ((c2[l] & 0xF8) == 0xF0)
1454                         {       // 4 bytes UTF-8, 2 bytes UTF-16.
1455                                 // Check for a full string.
1456                                 if ((l + 3) >= len)
1457                                 {
1458                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1459                                         l += 3;
1460                                         break;
1461                                 }
1462
1463                                 // Validate.
1464                                 bool valid = true;
1465                                 u8 l2 = 0;
1466                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1467                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1468                                 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1469                                 if (!valid)
1470                                 {
1471                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1472                                         l += l2;
1473                                         continue;
1474                                 }
1475
1476                                 // Decode.
1477                                 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1478                                 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1479                                 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1480                                 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1481
1482                                 // Split v up into a surrogate pair.
1483                                 uchar16_t x = static_cast<uchar16_t>(v);
1484                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1485                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1486
1487                                 array[pos++] = vh;
1488                                 array[pos++] = vl;
1489                                 l += 4;
1490                                 ++used;         // Using two shorts this time, so increase used by 1.
1491                         }
1492                         else if ((c2[l] & 0xF0) == 0xE0)
1493                         {       // 3 bytes UTF-8, 1 byte UTF-16.
1494                                 // Check for a full string.
1495                                 if ((l + 2) >= len)
1496                                 {
1497                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1498                                         l += 2;
1499                                         break;
1500                                 }
1501
1502                                 // Validate.
1503                                 bool valid = true;
1504                                 u8 l2 = 0;
1505                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1506                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1507                                 if (!valid)
1508                                 {
1509                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1510                                         l += l2;
1511                                         continue;
1512                                 }
1513
1514                                 // Decode.
1515                                 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1516                                 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1517                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1518                                 array[pos++] = ch;
1519                                 l += 3;
1520                         }
1521                         else if ((c2[l] & 0xE0) == 0xC0)
1522                         {       // 2 bytes UTF-8, 1 byte UTF-16.
1523                                 // Check for a full string.
1524                                 if ((l + 1) >= len)
1525                                 {
1526                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1527                                         l += 1;
1528                                         break;
1529                                 }
1530
1531                                 // Validate.
1532                                 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1533                                 {
1534                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1535                                         ++l;
1536                                         continue;
1537                                 }
1538
1539                                 // Decode.
1540                                 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1541                                 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1542                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1543                                 array[pos++] = ch;
1544                                 l += 2;
1545                         }
1546                         else
1547                         {       // 1 byte UTF-8, 1 byte UTF-16.
1548                                 // Validate.
1549                                 if (c2[l] > 0x7F)
1550                                 {       // Values above 0xF4 are restricted and aren't used.  By now, anything above 0x7F is invalid.
1551                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1552                                 }
1553                                 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1554                                 ++l;
1555                         }
1556                 }
1557                 array[used] = 0;
1558
1559                 // Validate our new UTF-16 string.
1560                 validate();
1561
1562                 return *this;
1563         }
1564
1565
1566         //! Appends a UTF-16 string to this ustring16
1567         //! \param other The UTF-16 string to append.
1568         //! \param length The length of the string to append.
1569         //! \return A reference to our current string.
1570         ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
1571         {
1572                 if (!other)
1573                         return *this;
1574
1575                 // Determine if the string is long enough for a BOM.
1576                 u32 len = 0;
1577                 const uchar16_t* p = other;
1578                 do
1579                 {
1580                         ++len;
1581                 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1582
1583                 // Check for the BOM to determine the string's endianness.
1584                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1585                 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1586                         c_end = unicode::EUTFEE_LITTLE;
1587                 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1588                         c_end = unicode::EUTFEE_BIG;
1589
1590                 // If a BOM was found, don't include it in the string.
1591                 const uchar16_t* c2 = other;
1592                 if (c_end != unicode::EUTFEE_NATIVE)
1593                 {
1594                         c2 = other + unicode::BOM_UTF16_LEN;
1595                         length -= unicode::BOM_UTF16_LEN;
1596                 }
1597
1598                 // Calculate the size of the string to read in.
1599                 len = 0;
1600                 p = c2;
1601                 do
1602                 {
1603                         ++len;
1604                 } while(*p++ && len < length);
1605                 if (len > length)
1606                         len = length;
1607
1608                 // If we need to grow the size of the array, do it now.
1609                 if (used + len >= allocated)
1610                         reallocate(used + (len * 2));
1611                 u32 start = used;
1612                 used += len;
1613
1614                 // Copy the string now.
1615                 unicode::EUTF_ENDIAN m_end = getEndianness();
1616                 for (u32 l = start; l < start + len; ++l)
1617                 {
1618                         array[l] = (uchar16_t)c2[l];
1619                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1620                                 array[l] = unicode::swapEndian16(array[l]);
1621                 }
1622
1623                 array[used] = 0;
1624
1625                 // Validate our new UTF-16 string.
1626                 validate();
1627                 return *this;
1628         }
1629
1630
1631         //! Appends a UTF-32 string to this ustring16
1632         //! \param other The UTF-32 string to append.
1633         //! \param length The length of the string to append.
1634         //! \return A reference to our current string.
1635         ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
1636         {
1637                 if (!other)
1638                         return *this;
1639
1640                 // Check for the BOM to determine the string's endianness.
1641                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1642                 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1643                         c_end = unicode::EUTFEE_LITTLE;
1644                 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1645                         c_end = unicode::EUTFEE_BIG;
1646
1647                 // If a BOM was found, don't include it in the string.
1648                 const uchar32_t* c2 = other;
1649                 if (c_end != unicode::EUTFEE_NATIVE)
1650                 {
1651                         c2 = other + unicode::BOM_UTF32_LEN;
1652                         length -= unicode::BOM_UTF32_LEN;
1653                 }
1654
1655                 // Calculate the size of the string to read in.
1656                 u32 len = 0;
1657                 const uchar32_t* p = c2;
1658                 do
1659                 {
1660                         ++len;
1661                 } while(*p++ && len < length);
1662                 if (len > length)
1663                         len = length;
1664
1665                 // If we need to grow the size of the array, do it now.
1666                 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1667                 if (used + (len * 2) >= allocated)
1668                         reallocate(used + ((len * 2) * 2));
1669                 u32 start = used;
1670
1671                 // Convert UTF-32 to UTF-16.
1672                 unicode::EUTF_ENDIAN m_end = getEndianness();
1673                 u32 pos = start;
1674                 for (u32 l = 0; l<len; ++l)
1675                 {
1676                         ++used;
1677
1678                         uchar32_t ch = c2[l];
1679                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1680                                 ch = unicode::swapEndian32(ch);
1681
1682                         if (ch > 0xFFFF)
1683                         {
1684                                 // Split ch up into a surrogate pair as it is over 16 bits long.
1685                                 uchar16_t x = static_cast<uchar16_t>(ch);
1686                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1687                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1688                                 array[pos++] = vh;
1689                                 array[pos++] = vl;
1690                                 ++used;         // Using two shorts, so increased used again.
1691                         }
1692                         else if (ch >= 0xD800 && ch <= 0xDFFF)
1693                         {
1694                                 // Between possible UTF-16 surrogates (invalid!)
1695                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1696                         }
1697                         else array[pos++] = static_cast<uchar16_t>(ch);
1698                 }
1699                 array[used] = 0;
1700
1701                 // Validate our new UTF-16 string.
1702                 validate();
1703
1704                 return *this;
1705         }
1706
1707
1708         //! Appends a ustring16 to this ustring16
1709         //! \param other The string to append to this one.
1710         //! \return A reference to our current string.
1711         ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
1712         {
1713                 const uchar16_t* oa = other.c_str();
1714
1715                 u32 len = other.size_raw();
1716
1717                 if (used + len >= allocated)
1718                         reallocate(used + len);
1719
1720                 for (u32 l=0; l<len; ++l)
1721                         array[used+l] = oa[l];
1722
1723                 used += len;
1724                 array[used] = 0;
1725
1726                 return *this;
1727         }
1728
1729
1730         //! Appends a certain amount of characters of a ustring16 to this ustring16.
1731         //! \param other The string to append to this one.
1732         //! \param length How many characters of the other string to add to this one.
1733         //! \return A reference to our current string.
1734         ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
1735         {
1736                 if (other.size() == 0)
1737                         return *this;
1738
1739                 if (other.size() < length)
1740                 {
1741                         append(other);
1742                         return *this;
1743                 }
1744
1745                 if (used + length * 2 >= allocated)
1746                         reallocate(used + length * 2);
1747
1748                 const_iterator iter(other, 0);
1749                 u32 l = length;
1750                 while (!iter.atEnd() && l)
1751                 {
1752                         uchar32_t c = *iter;
1753                         append(c);
1754                         ++iter;
1755                         --l;
1756                 }
1757
1758                 return *this;
1759         }
1760
1761
1762         //! Reserves some memory.
1763         //! \param count The amount of characters to reserve.
1764         void reserve(u32 count)
1765         {
1766                 if (count < allocated)
1767                         return;
1768
1769                 reallocate(count);
1770         }
1771
1772
1773         //! Finds first occurrence of character.
1774         //! \param c The character to search for.
1775         //! \return Position where the character has been found, or -1 if not found.
1776         s32 findFirst(uchar32_t c) const
1777         {
1778                 const_iterator i(*this, 0);
1779
1780                 s32 pos = 0;
1781                 while (!i.atEnd())
1782                 {
1783                         uchar32_t t = *i;
1784                         if (c == t)
1785                                 return pos;
1786                         ++pos;
1787                         ++i;
1788                 }
1789
1790                 return -1;
1791         }
1792
1793         //! Finds first occurrence of a character of a list.
1794         //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1795         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1796         //! \return Position where one of the characters has been found, or -1 if not found.
1797         s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1798         {
1799                 if (!c || !count)
1800                         return -1;
1801
1802                 const_iterator i(*this, 0);
1803
1804                 s32 pos = 0;
1805                 while (!i.atEnd())
1806                 {
1807                         uchar32_t t = *i;
1808                         for (u32 j=0; j<count; ++j)
1809                                 if (t == c[j])
1810                                         return pos;
1811                         ++pos;
1812                         ++i;
1813                 }
1814
1815                 return -1;
1816         }
1817
1818
1819         //! Finds first position of a character not in a given list.
1820         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1821         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1822         //! \return Position where the character has been found, or -1 if not found.
1823         s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1824         {
1825                 if (!c || !count)
1826                         return -1;
1827
1828                 const_iterator i(*this, 0);
1829
1830                 s32 pos = 0;
1831                 while (!i.atEnd())
1832                 {
1833                         uchar32_t t = *i;
1834                         u32 j;
1835                         for (j=0; j<count; ++j)
1836                                 if (t == c[j])
1837                                         break;
1838
1839                         if (j==count)
1840                                 return pos;
1841                         ++pos;
1842                         ++i;
1843                 }
1844
1845                 return -1;
1846         }
1847
1848         //! Finds last position of a character not in a given list.
1849         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1850         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1851         //! \return Position where the character has been found, or -1 if not found.
1852         s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1853         {
1854                 if (!c || !count)
1855                         return -1;
1856
1857                 const_iterator i(end());
1858                 --i;
1859
1860                 s32 pos = size() - 1;
1861                 while (!i.atStart())
1862                 {
1863                         uchar32_t t = *i;
1864                         u32 j;
1865                         for (j=0; j<count; ++j)
1866                                 if (t == c[j])
1867                                         break;
1868
1869                         if (j==count)
1870                                 return pos;
1871                         --pos;
1872                         --i;
1873                 }
1874
1875                 return -1;
1876         }
1877
1878         //! Finds next occurrence of character.
1879         //! \param c The character to search for.
1880         //! \param startPos The position in the string to start searching.
1881         //! \return Position where the character has been found, or -1 if not found.
1882         s32 findNext(uchar32_t c, u32 startPos) const
1883         {
1884                 const_iterator i(*this, startPos);
1885
1886                 s32 pos = startPos;
1887                 while (!i.atEnd())
1888                 {
1889                         uchar32_t t = *i;
1890                         if (t == c)
1891                                 return pos;
1892                         ++pos;
1893                         ++i;
1894                 }
1895
1896                 return -1;
1897         }
1898
1899
1900         //! Finds last occurrence of character.
1901         //! \param c The character to search for.
1902         //! \param start The start position of the reverse search ( default = -1, on end ).
1903         //! \return Position where the character has been found, or -1 if not found.
1904         s32 findLast(uchar32_t c, s32 start = -1) const
1905         {
1906                 u32 s = size();
1907                 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1908
1909                 const_iterator i(*this, start);
1910                 u32 pos = start;
1911                 while (!i.atStart())
1912                 {
1913                         uchar32_t t = *i;
1914                         if (t == c)
1915                                 return pos;
1916                         --pos;
1917                         --i;
1918                 }
1919
1920                 return -1;
1921         }
1922
1923         //! Finds last occurrence of a character in a list.
1924         //! \param c A list of strings to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1925         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1926         //! \return Position where one of the characters has been found, or -1 if not found.
1927         s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1928         {
1929                 if (!c || !count)
1930                         return -1;
1931
1932                 const_iterator i(end());
1933                 --i;
1934
1935                 s32 pos = size();
1936                 while (!i.atStart())
1937                 {
1938                         uchar32_t t = *i;
1939                         for (u32 j=0; j<count; ++j)
1940                                 if (t == c[j])
1941                                         return pos;
1942                         --pos;
1943                         --i;
1944                 }
1945
1946                 return -1;
1947         }
1948
1949
1950         //! Finds another ustring16 in this ustring16.
1951         //! \param str The string to find.
1952         //! \param start The start position of the search.
1953         //! \return Positions where the ustring16 has been found, or -1 if not found.
1954         s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
1955         {
1956                 u32 my_size = size();
1957                 u32 their_size = str.size();
1958
1959                 if (their_size == 0 || my_size - start < their_size)
1960                         return -1;
1961
1962                 const_iterator i(*this, start);
1963
1964                 s32 pos = start;
1965                 while (!i.atEnd())
1966                 {
1967                         const_iterator i2(i);
1968                         const_iterator j(str, 0);
1969                         uchar32_t t1 = (uchar32_t)*i2;
1970                         uchar32_t t2 = (uchar32_t)*j;
1971                         while (t1 == t2)
1972                         {
1973                                 ++i2;
1974                                 ++j;
1975                                 if (j.atEnd())
1976                                         return pos;
1977                                 t1 = (uchar32_t)*i2;
1978                                 t2 = (uchar32_t)*j;
1979                         }
1980                         ++i;
1981                         ++pos;
1982                 }
1983
1984                 return -1;
1985         }
1986
1987
1988         //! Finds another ustring16 in this ustring16.
1989         //! \param str The string to find.
1990         //! \param start The start position of the search.
1991         //! \return Positions where the string has been found, or -1 if not found.
1992         s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
1993         {
1994                 const uchar16_t* data = str.c_str();
1995                 if (data && *data)
1996                 {
1997                         u32 len = 0;
1998
1999                         while (data[len])
2000                                 ++len;
2001
2002                         if (len > used)
2003                                 return -1;
2004
2005                         for (u32 i=start; i<=used-len; ++i)
2006                         {
2007                                 u32 j=0;
2008
2009                                 while(data[j] && array[i+j] == data[j])
2010                                         ++j;
2011
2012                                 if (!data[j])
2013                                         return i;
2014                         }
2015                 }
2016
2017                 return -1;
2018         }
2019
2020
2021         //! Returns a substring.
2022         //! \param begin: Start of substring.
2023         //! \param length: Length of substring.
2024         //! \return A reference to our current string.
2025         ustring16<TAlloc> subString(u32 begin, s32 length) const
2026         {
2027                 u32 len = size();
2028                 // if start after ustring16
2029                 // or no proper substring length
2030                 if ((length <= 0) || (begin>=len))
2031                         return ustring16<TAlloc>("");
2032                 // clamp length to maximal value
2033                 if ((length+begin) > len)
2034                         length = len-begin;
2035
2036                 ustring16<TAlloc> o;
2037                 o.reserve((length+1) * 2);
2038
2039                 const_iterator i(*this, begin);
2040                 while (!i.atEnd() && length)
2041                 {
2042                         o.append(*i);
2043                         ++i;
2044                         --length;
2045                 }
2046
2047                 return o;
2048         }
2049
2050
2051         //! Appends a character to this ustring16.
2052         //! \param c Character to append.
2053         //! \return A reference to our current string.
2054         ustring16<TAlloc>& operator += (char c)
2055         {
2056                 append((uchar32_t)c);
2057                 return *this;
2058         }
2059
2060
2061         //! Appends a character to this ustring16.
2062         //! \param c Character to append.
2063         //! \return A reference to our current string.
2064         ustring16<TAlloc>& operator += (uchar32_t c)
2065         {
2066                 append(c);
2067                 return *this;
2068         }
2069
2070
2071         //! Appends a number to this ustring16.
2072         //! \param c Number to append.
2073         //! \return A reference to our current string.
2074         ustring16<TAlloc>& operator += (short c)
2075         {
2076                 append(core::stringc(c));
2077                 return *this;
2078         }
2079
2080
2081         //! Appends a number to this ustring16.
2082         //! \param c Number to append.
2083         //! \return A reference to our current string.
2084         ustring16<TAlloc>& operator += (unsigned short c)
2085         {
2086                 append(core::stringc(c));
2087                 return *this;
2088         }
2089
2090
2091 #ifdef USTRING_CPP0X_NEWLITERALS
2092         //! Appends a number to this ustring16.
2093         //! \param c Number to append.
2094         //! \return A reference to our current string.
2095         ustring16<TAlloc>& operator += (int c)
2096         {
2097                 append(core::stringc(c));
2098                 return *this;
2099         }
2100
2101
2102         //! Appends a number to this ustring16.
2103         //! \param c Number to append.
2104         //! \return A reference to our current string.
2105         ustring16<TAlloc>& operator += (unsigned int c)
2106         {
2107                 append(core::stringc(c));
2108                 return *this;
2109         }
2110 #endif
2111
2112
2113         //! Appends a number to this ustring16.
2114         //! \param c Number to append.
2115         //! \return A reference to our current string.
2116         ustring16<TAlloc>& operator += (long c)
2117         {
2118                 append(core::stringc(c));
2119                 return *this;
2120         }
2121
2122
2123         //! Appends a number to this ustring16.
2124         //! \param c Number to append.
2125         //! \return A reference to our current string.
2126         ustring16<TAlloc>& operator += (unsigned long c)
2127         {
2128                 append(core::stringc(c));
2129                 return *this;
2130         }
2131
2132
2133         //! Appends a number to this ustring16.
2134         //! \param c Number to append.
2135         //! \return A reference to our current string.
2136         ustring16<TAlloc>& operator += (double c)
2137         {
2138                 append(core::stringc(c));
2139                 return *this;
2140         }
2141
2142
2143         //! Appends a char ustring16 to this ustring16.
2144         //! \param c Char ustring16 to append.
2145         //! \return A reference to our current string.
2146         ustring16<TAlloc>& operator += (const uchar16_t* const c)
2147         {
2148                 append(c);
2149                 return *this;
2150         }
2151
2152
2153         //! Appends a ustring16 to this ustring16.
2154         //! \param other ustring16 to append.
2155         //! \return A reference to our current string.
2156         ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
2157         {
2158                 append(other);
2159                 return *this;
2160         }
2161
2162
2163         //! Replaces all characters of a given type with another one.
2164         //! \param toReplace Character to replace.
2165         //! \param replaceWith Character replacing the old one.
2166         //! \return A reference to our current string.
2167         ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
2168         {
2169                 iterator i(*this, 0);
2170                 while (!i.atEnd())
2171                 {
2172                         typename ustring16<TAlloc>::access a = *i;
2173                         if ((uchar32_t)a == toReplace)
2174                                 a = replaceWith;
2175                         ++i;
2176                 }
2177                 return *this;
2178         }
2179
2180
2181         //! Replaces all instances of a string with another one.
2182         //! \param toReplace The string to replace.
2183         //! \param replaceWith The string replacing the old one.
2184         //! \return A reference to our current string.
2185         ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
2186         {
2187                 if (toReplace.size() == 0)
2188                         return *this;
2189
2190                 const uchar16_t* other = toReplace.c_str();
2191                 const uchar16_t* replace = replaceWith.c_str();
2192                 const u32 other_size = toReplace.size_raw();
2193                 const u32 replace_size = replaceWith.size_raw();
2194
2195                 // Determine the delta.  The algorithm will change depending on the delta.
2196                 s32 delta = replace_size - other_size;
2197
2198                 // A character for character replace.  The string will not shrink or grow.
2199                 if (delta == 0)
2200                 {
2201                         s32 pos = 0;
2202                         while ((pos = find_raw(other, pos)) != -1)
2203                         {
2204                                 for (u32 i = 0; i < replace_size; ++i)
2205                                         array[pos + i] = replace[i];
2206                                 ++pos;
2207                         }
2208                         return *this;
2209                 }
2210
2211                 // We are going to be removing some characters.  The string will shrink.
2212                 if (delta < 0)
2213                 {
2214                         u32 i = 0;
2215                         for (u32 pos = 0; pos <= used; ++i, ++pos)
2216                         {
2217                                 // Is this potentially a match?
2218                                 if (array[pos] == *other)
2219                                 {
2220                                         // Check to see if we have a match.
2221                                         u32 j;
2222                                         for (j = 0; j < other_size; ++j)
2223                                         {
2224                                                 if (array[pos + j] != other[j])
2225                                                         break;
2226                                         }
2227
2228                                         // If we have a match, replace characters.
2229                                         if (j == other_size)
2230                                         {
2231                                                 for (j = 0; j < replace_size; ++j)
2232                                                         array[i + j] = replace[j];
2233                                                 i += replace_size - 1;
2234                                                 pos += other_size - 1;
2235                                                 continue;
2236                                         }
2237                                 }
2238
2239                                 // No match found, just copy characters.
2240                                 array[i - 1] = array[pos];
2241                         }
2242                         array[i] = 0;
2243                         used = i;
2244
2245                         return *this;
2246                 }
2247
2248                 // We are going to be adding characters, so the string size will increase.
2249                 // Count the number of times toReplace exists in the string so we can allocate the new size.
2250                 u32 find_count = 0;
2251                 s32 pos = 0;
2252                 while ((pos = find_raw(other, pos)) != -1)
2253                 {
2254                         ++find_count;
2255                         ++pos;
2256                 }
2257
2258                 // Re-allocate the string now, if needed.
2259                 u32 len = delta * find_count;
2260                 if (used + len >= allocated)
2261                         reallocate(used + len);
2262
2263                 // Start replacing.
2264                 pos = 0;
2265                 while ((pos = find_raw(other, pos)) != -1)
2266                 {
2267                         uchar16_t* start = array + pos + other_size - 1;
2268                         uchar16_t* ptr   = array + used;
2269                         uchar16_t* end   = array + used + delta;
2270
2271                         // Shift characters to make room for the string.
2272                         while (ptr != start)
2273                         {
2274                                 *end = *ptr;
2275                                 --ptr;
2276                                 --end;
2277                         }
2278
2279                         // Add the new string now.
2280                         for (u32 i = 0; i < replace_size; ++i)
2281                                 array[pos + i] = replace[i];
2282
2283                         pos += replace_size;
2284                         used += delta;
2285                 }
2286
2287                 // Terminate the string and return ourself.
2288                 array[used] = 0;
2289                 return *this;
2290         }
2291
2292
2293         //! Removes characters from a ustring16..
2294         //! \param c The character to remove.
2295         //! \return A reference to our current string.
2296         ustring16<TAlloc>& remove(uchar32_t c)
2297         {
2298                 u32 pos = 0;
2299                 u32 found = 0;
2300                 u32 len = (c > 0xFFFF ? 2 : 1);         // Remove characters equal to the size of c as a UTF-16 character.
2301                 for (u32 i=0; i<=used; ++i)
2302                 {
2303                         uchar32_t uc32 = 0;
2304                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2305                                 uc32 |= array[i];
2306                         else if (i + 1 <= used)
2307                         {
2308                                 // Convert the surrogate pair into a single UTF-32 character.
2309                                 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2310                         }
2311                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2312
2313                         if (uc32 == c)
2314                         {
2315                                 found += len;
2316                                 continue;
2317                         }
2318
2319                         array[pos++] = array[i];
2320                         if (len2 == 2)
2321                                 array[pos++] = array[++i];
2322                 }
2323                 used -= found;
2324                 array[used] = 0;
2325                 return *this;
2326         }
2327
2328
2329         //! Removes a ustring16 from the ustring16.
2330         //! \param toRemove The string to remove.
2331         //! \return A reference to our current string.
2332         ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
2333         {
2334                 u32 size = toRemove.size_raw();
2335                 if (size == 0) return *this;
2336
2337                 const uchar16_t* tra = toRemove.c_str();
2338                 u32 pos = 0;
2339                 u32 found = 0;
2340                 for (u32 i=0; i<=used; ++i)
2341                 {
2342                         u32 j = 0;
2343                         while (j < size)
2344                         {
2345                                 if (array[i + j] != tra[j])
2346                                         break;
2347                                 ++j;
2348                         }
2349                         if (j == size)
2350                         {
2351                                 found += size;
2352                                 i += size - 1;
2353                                 continue;
2354                         }
2355
2356                         array[pos++] = array[i];
2357                 }
2358                 used -= found;
2359                 array[used] = 0;
2360                 return *this;
2361         }
2362
2363
2364         //! Removes characters from the ustring16.
2365         //! \param characters The characters to remove.
2366         //! \return A reference to our current string.
2367         ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
2368         {
2369                 if (characters.size_raw() == 0)
2370                         return *this;
2371
2372                 u32 pos = 0;
2373                 u32 found = 0;
2374                 const_iterator iter(characters);
2375                 for (u32 i=0; i<=used; ++i)
2376                 {
2377                         uchar32_t uc32 = 0;
2378                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2379                                 uc32 |= array[i];
2380                         else if (i + 1 <= used)
2381                         {
2382                                 // Convert the surrogate pair into a single UTF-32 character.
2383                                 uc32 = unicode::toUTF32(array[i], array[i+1]);
2384                         }
2385                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2386
2387                         bool cont = false;
2388                         iter.toStart();
2389                         while (!iter.atEnd())
2390                         {
2391                                 uchar32_t c = *iter;
2392                                 if (uc32 == c)
2393                                 {
2394                                         found += (c > 0xFFFF ? 2 : 1);          // Remove characters equal to the size of c as a UTF-16 character.
2395                                         ++i;
2396                                         cont = true;
2397                                         break;
2398                                 }
2399                                 ++iter;
2400                         }
2401                         if (cont) continue;
2402
2403                         array[pos++] = array[i];
2404                         if (len2 == 2)
2405                                 array[pos++] = array[++i];
2406                 }
2407                 used -= found;
2408                 array[used] = 0;
2409                 return *this;
2410         }
2411
2412
2413         //! Trims the ustring16.
2414         //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16.
2415         //! \param whitespace The characters that are to be considered as whitespace.
2416         //! \return A reference to our current string.
2417         ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
2418         {
2419                 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2420
2421                 // find start and end of the substring without the specified characters
2422                 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2423                 if (begin == -1)
2424                         return (*this="");
2425
2426                 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2427
2428                 return (*this = subString(begin, (end +1) - begin));
2429         }
2430
2431
2432         //! Erases a character from the ustring16.
2433         //! May be slow, because all elements following after the erased element have to be copied.
2434         //! \param index Index of element to be erased.
2435         //! \return A reference to our current string.
2436         ustring16<TAlloc>& erase(u32 index)
2437         {
2438                 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2439
2440                 iterator i(*this, index);
2441
2442                 uchar32_t t = *i;
2443                 u32 len = (t > 0xFFFF ? 2 : 1);
2444
2445                 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2446                         array[j - len] = array[j];
2447
2448                 used -= len;
2449                 array[used] = 0;
2450
2451                 return *this;
2452         }
2453
2454
2455         //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2456         //! \return A reference to our current string.
2457         ustring16<TAlloc>& validate()
2458         {
2459                 // Validate all unicode characters.
2460                 for (u32 i=0; i<allocated; ++i)
2461                 {
2462                         // Terminate on existing null.
2463                         if (array[i] == 0)
2464                         {
2465                                 used = i;
2466                                 return *this;
2467                         }
2468                         if (UTF16_IS_SURROGATE(array[i]))
2469                         {
2470                                 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2471                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2472                                 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2473                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2474                                 ++i;
2475                         }
2476                         if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2477                                 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2478                 }
2479
2480                 // terminate
2481                 used = 0;
2482                 if (allocated > 0)
2483                 {
2484                         used = allocated - 1;
2485                         array[used] = 0;
2486                 }
2487                 return *this;
2488         }
2489
2490
2491         //! Gets the last char of the ustring16, or 0.
2492         //! \return The last char of the ustring16, or 0.
2493         uchar32_t lastChar() const
2494         {
2495                 if (used < 1)
2496                         return 0;
2497
2498                 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2499                 {
2500                         // Make sure we have a paired surrogate.
2501                         if (used < 2)
2502                                 return 0;
2503
2504                         // Check for an invalid surrogate.
2505                         if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2506                                 return 0;
2507
2508                         // Convert the surrogate pair into a single UTF-32 character.
2509                         return unicode::toUTF32(array[used-2], array[used-1]);
2510                 }
2511                 else
2512                 {
2513                         return array[used-1];
2514                 }
2515         }
2516
2517
2518         //! Split the ustring16 into parts.
2519         /** This method will split a ustring16 at certain delimiter characters
2520         into the container passed in as reference. The type of the container
2521         has to be given as template parameter. It must provide a push_back and
2522         a size method.
2523         \param ret The result container
2524         \param c C-style ustring16 of delimiter characters
2525         \param count Number of delimiter characters
2526         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2527         container. If two delimiters occur without a character in between, an
2528         empty substring would be placed in the result. If this flag is set,
2529         only non-empty strings are stored.
2530         \param keepSeparators Flag which allows to add the separator to the
2531         result ustring16. If this flag is true, the concatenation of the
2532         substrings results in the original ustring16. Otherwise, only the
2533         characters between the delimiters are returned.
2534         \return The number of resulting substrings
2535         */
2536         template<class container>
2537         u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2538         {
2539                 if (!c)
2540                         return 0;
2541
2542                 const_iterator i(*this);
2543                 const u32 oldSize=ret.size();
2544                 u32 pos = 0;
2545                 u32 lastpos = 0;
2546                 u32 lastpospos = 0;
2547                 bool lastWasSeparator = false;
2548                 while (!i.atEnd())
2549                 {
2550                         uchar32_t ch = *i;
2551                         bool foundSeparator = false;
2552                         for (u32 j=0; j<count; ++j)
2553                         {
2554                                 if (ch == c[j])
2555                                 {
2556                                         if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2557                                                         !lastWasSeparator)
2558                                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
2559                                         foundSeparator = true;
2560                                         lastpos = (keepSeparators ? pos : pos + 1);
2561                                         lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2562                                         break;
2563                                 }
2564                         }
2565                         lastWasSeparator = foundSeparator;
2566                         ++pos;
2567                         ++i;
2568                 }
2569                 u32 s = size() + 1;
2570                 if (s > lastpos)
2571                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
2572                 return ret.size()-oldSize;
2573         }
2574
2575
2576         //! Split the ustring16 into parts.
2577         /** This method will split a ustring16 at certain delimiter characters
2578         into the container passed in as reference. The type of the container
2579         has to be given as template parameter. It must provide a push_back and
2580         a size method.
2581         \param ret The result container
2582         \param c A unicode string of delimiter characters
2583         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2584         container. If two delimiters occur without a character in between, an
2585         empty substring would be placed in the result. If this flag is set,
2586         only non-empty strings are stored.
2587         \param keepSeparators Flag which allows to add the separator to the
2588         result ustring16. If this flag is true, the concatenation of the
2589         substrings results in the original ustring16. Otherwise, only the
2590         characters between the delimiters are returned.
2591         \return The number of resulting substrings
2592         */
2593         template<class container>
2594         u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2595         {
2596                 core::array<uchar32_t> v = c.toUTF32();
2597                 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2598         }
2599
2600
2601         //! Gets the size of the allocated memory buffer for the string.
2602         //! \return The size of the allocated memory buffer.
2603         u32 capacity() const
2604         {
2605                 return allocated;
2606         }
2607
2608
2609         //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates.
2610         //! \return The raw number of UTF-16 code points, excluding the trialing NUL.
2611         u32 size_raw() const
2612         {
2613                 return used;
2614         }
2615
2616
2617         //! Inserts a character into the string.
2618         //! \param c The character to insert.
2619         //! \param pos The position to insert the character.
2620         //! \return A reference to our current string.
2621         ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
2622         {
2623                 u8 len = (c > 0xFFFF ? 2 : 1);
2624
2625                 if (used + len >= allocated)
2626                         reallocate(used + len);
2627
2628                 used += len;
2629
2630                 iterator iter(*this, pos);
2631                 for (u32 i = used - 2; i > iter.getPos(); --i)
2632                         array[i] = array[i - len];
2633
2634                 if (c > 0xFFFF)
2635                 {
2636                         // c will be multibyte, so split it up into a surrogate pair.
2637                         uchar16_t x = static_cast<uchar16_t>(c);
2638                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2639                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2640                         array[iter.getPos()] = vh;
2641                         array[iter.getPos()+1] = vl;
2642                 }
2643                 else
2644                 {
2645                         array[iter.getPos()] = static_cast<uchar16_t>(c);
2646                 }
2647                 array[used] = 0;
2648                 return *this;
2649         }
2650
2651
2652         //! Inserts a string into the string.
2653         //! \param c The string to insert.
2654         //! \param pos The position to insert the string.
2655         //! \return A reference to our current string.
2656         ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
2657         {
2658                 u32 len = c.size_raw();
2659                 if (len == 0) return *this;
2660
2661                 if (used + len >= allocated)
2662                         reallocate(used + len);
2663
2664                 used += len;
2665
2666                 iterator iter(*this, pos);
2667                 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2668                         array[i] = array[i - len];
2669
2670                 const uchar16_t* s = c.c_str();
2671                 for (u32 i = 0; i < len; ++i)
2672                 {
2673                         array[pos++] = *s;
2674                         ++s;
2675                 }
2676
2677                 array[used] = 0;
2678                 return *this;
2679         }
2680
2681
2682         //! Inserts a character into the string.
2683         //! \param c The character to insert.
2684         //! \param pos The position to insert the character.
2685         //! \return A reference to our current string.
2686         ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
2687         {
2688                 if (used + 1 >= allocated)
2689                         reallocate(used + 1);
2690
2691                 ++used;
2692
2693                 for (u32 i = used - 1; i > pos; --i)
2694                         array[i] = array[i - 1];
2695
2696                 array[pos] = c;
2697                 array[used] = 0;
2698                 return *this;
2699         }
2700
2701
2702         //! Removes a character from string.
2703         //! \param pos Position of the character to remove.
2704         //! \return A reference to our current string.
2705         ustring16<TAlloc>& erase_raw(u32 pos)
2706         {
2707                 for (u32 i=pos; i<=used; ++i)
2708                 {
2709                         array[i] = array[i + 1];
2710                 }
2711                 --used;
2712                 array[used] = 0;
2713                 return *this;
2714         }
2715
2716
2717         //! Replaces a character in the string.
2718         //! \param c The new character.
2719         //! \param pos The position of the character to replace.
2720         //! \return A reference to our current string.
2721         ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
2722         {
2723                 array[pos] = c;
2724                 return *this;
2725         }
2726
2727
2728         //! Returns an iterator to the beginning of the string.
2729         //! \return An iterator to the beginning of the string.
2730         iterator begin()
2731         {
2732                 iterator i(*this, 0);
2733                 return i;
2734         }
2735
2736
2737         //! Returns an iterator to the beginning of the string.
2738         //! \return An iterator to the beginning of the string.
2739         const_iterator begin() const
2740         {
2741                 const_iterator i(*this, 0);
2742                 return i;
2743         }
2744
2745
2746         //! Returns an iterator to the beginning of the string.
2747         //! \return An iterator to the beginning of the string.
2748         const_iterator cbegin() const
2749         {
2750                 const_iterator i(*this, 0);
2751                 return i;
2752         }
2753
2754
2755         //! Returns an iterator to the end of the string.
2756         //! \return An iterator to the end of the string.
2757         iterator end()
2758         {
2759                 iterator i(*this, 0);
2760                 i.toEnd();
2761                 return i;
2762         }
2763
2764
2765         //! Returns an iterator to the end of the string.
2766         //! \return An iterator to the end of the string.
2767         const_iterator end() const
2768         {
2769                 const_iterator i(*this, 0);
2770                 i.toEnd();
2771                 return i;
2772         }
2773
2774
2775         //! Returns an iterator to the end of the string.
2776         //! \return An iterator to the end of the string.
2777         const_iterator cend() const
2778         {
2779                 const_iterator i(*this, 0);
2780                 i.toEnd();
2781                 return i;
2782         }
2783
2784
2785         //! Converts the string to a UTF-8 encoded string.
2786         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2787         //! \return A string containing the UTF-8 encoded string.
2788         core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2789         {
2790                 core::string<uchar8_t> ret;
2791                 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2792                 const_iterator iter(*this, 0);
2793
2794                 // Add the byte order mark if the user wants it.
2795                 if (addBOM)
2796                 {
2797                         ret.append(unicode::BOM_ENCODE_UTF8[0]);
2798                         ret.append(unicode::BOM_ENCODE_UTF8[1]);
2799                         ret.append(unicode::BOM_ENCODE_UTF8[2]);
2800                 }
2801
2802                 while (!iter.atEnd())
2803                 {
2804                         uchar32_t c = *iter;
2805                         if (c > 0xFFFF)
2806                         {       // 4 bytes
2807                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2808                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2809                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2810                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2811                                 ret.append(b1);
2812                                 ret.append(b2);
2813                                 ret.append(b3);
2814                                 ret.append(b4);
2815                         }
2816                         else if (c > 0x7FF)
2817                         {       // 3 bytes
2818                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2819                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2820                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2821                                 ret.append(b1);
2822                                 ret.append(b2);
2823                                 ret.append(b3);
2824                         }
2825                         else if (c > 0x7F)
2826                         {       // 2 bytes
2827                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2828                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2829                                 ret.append(b1);
2830                                 ret.append(b2);
2831                         }
2832                         else
2833                         {       // 1 byte
2834                                 ret.append(static_cast<uchar8_t>(c));
2835                         }
2836                         ++iter;
2837                 }
2838                 return ret;
2839         }
2840
2841
2842         //! Converts the string to a UTF-8 encoded string array.
2843         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2844         //! \return An array containing the UTF-8 encoded string.
2845         core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2846         {
2847                 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2848                 const_iterator iter(*this, 0);
2849
2850                 // Add the byte order mark if the user wants it.
2851                 if (addBOM)
2852                 {
2853                         ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2854                         ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2855                         ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2856                 }
2857
2858                 while (!iter.atEnd())
2859                 {
2860                         uchar32_t c = *iter;
2861                         if (c > 0xFFFF)
2862                         {       // 4 bytes
2863                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2864                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2865                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2866                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2867                                 ret.push_back(b1);
2868                                 ret.push_back(b2);
2869                                 ret.push_back(b3);
2870                                 ret.push_back(b4);
2871                         }
2872                         else if (c > 0x7FF)
2873                         {       // 3 bytes
2874                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2875                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2876                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2877                                 ret.push_back(b1);
2878                                 ret.push_back(b2);
2879                                 ret.push_back(b3);
2880                         }
2881                         else if (c > 0x7F)
2882                         {       // 2 bytes
2883                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2884                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2885                                 ret.push_back(b1);
2886                                 ret.push_back(b2);
2887                         }
2888                         else
2889                         {       // 1 byte
2890                                 ret.push_back(static_cast<uchar8_t>(c));
2891                         }
2892                         ++iter;
2893                 }
2894                 ret.push_back(0);
2895                 return ret;
2896         }
2897
2898
2899 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
2900         //! Converts the string to a UTF-16 encoded string.
2901         //! \param endian The desired endianness of the string.
2902         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2903         //! \return A string containing the UTF-16 encoded string.
2904         core::string<char16_t> toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2905         {
2906                 core::string<char16_t> ret;
2907                 ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2908
2909                 // Add the BOM if specified.
2910                 if (addBOM)
2911                 {
2912                         if (endian == unicode::EUTFEE_NATIVE)
2913                                 ret[0] = unicode::BOM;
2914                         else if (endian == unicode::EUTFEE_LITTLE)
2915                         {
2916                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ret.c_str());
2917                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2918                                 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2919                         }
2920                         else
2921                         {
2922                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ret.c_str());
2923                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2924                                 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2925                         }
2926                 }
2927
2928                 ret.append(array);
2929                 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2930                 {
2931                         char16_t* ptr = ret.c_str();
2932                         for (u32 i = 0; i < ret.size(); ++i)
2933                                 *ptr++ = unicode::swapEndian16(*ptr);
2934                 }
2935                 return ret;
2936         }
2937 #endif
2938
2939
2940         //! Converts the string to a UTF-16 encoded string array.
2941         //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2942         //! \param endian The desired endianness of the string.
2943         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2944         //! \return An array containing the UTF-16 encoded string.
2945         core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2946         {
2947                 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2948                 uchar16_t* ptr = ret.pointer();
2949
2950                 // Add the BOM if specified.
2951                 if (addBOM)
2952                 {
2953                         if (endian == unicode::EUTFEE_NATIVE)
2954                                 *ptr = unicode::BOM;
2955                         else if (endian == unicode::EUTFEE_LITTLE)
2956                         {
2957                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2958                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2959                                 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2960                         }
2961                         else
2962                         {
2963                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2964                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2965                                 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2966                         }
2967                         ++ptr;
2968                 }
2969
2970                 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2971                 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2972                 {
2973                         for (u32 i = 0; i <= used; ++i)
2974                                 ptr[i] = unicode::swapEndian16(ptr[i]);
2975                 }
2976                 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2977                 ret.push_back(0);
2978                 return ret;
2979         }
2980
2981
#ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
        //! Builds a UTF-32 encoded Irrlicht string from this string.
        //! \param endian The byte order the code points should be stored in.
        //! \param addBOM If true, a byte-order mark matching \p endian is placed first.
        //! \return A core::string<char32_t> holding one element per code point.
        core::string<char32_t> toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
        {
                core::string<char32_t> ret;
                ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0));

                // Add the BOM if specified.
                if (addBOM)
                {
                        if (endian == unicode::EUTFEE_NATIVE)
                                ret.append(unicode::BOM);
                        else
                        {
                                // Assemble the BOM byte by byte so it lands in the requested order.
                                union
                                {
                                        uchar32_t full;
                                        u8 chunk[4];
                                } bom;

                                const bool wantLittle = (endian == unicode::EUTFEE_LITTLE);
                                for (u32 i = 0; i < 4; ++i)
                                        bom.chunk[i] = wantLittle ? unicode::BOM_ENCODE_UTF32_LE[i] : unicode::BOM_ENCODE_UTF32_BE[i];
                                ret.append(bom.full);
                        }
                }

                // The swap decision does not change while iterating, so evaluate it once.
                const bool mustSwap = (endian != unicode::EUTFEE_NATIVE) && (getEndianness() != endian);
                for (const_iterator iter(*this, 0); !iter.atEnd(); ++iter)
                {
                        uchar32_t c = *iter;
                        if (mustSwap)
                                c = unicode::swapEndian32(c);
                        ret.append(c);
                }
                return ret;
        }
#endif
3035
3036
3037         //! Converts the string to a UTF-32 encoded string array.
3038         //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
3039         //! \param endian The desired endianness of the string.
3040         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3041         //! \return An array containing the UTF-32 encoded string.
3042         core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3043         {
3044                 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
3045                 const_iterator iter(*this, 0);
3046
3047                 // Add the BOM if specified.
3048                 if (addBOM)
3049                 {
3050                         if (endian == unicode::EUTFEE_NATIVE)
3051                                 ret.push_back(unicode::BOM);
3052                         else
3053                         {
3054                                 union
3055                                 {
3056                                         uchar32_t full;
3057                                         u8 chunk[4];
3058                                 } t;
3059
3060                                 if (endian == unicode::EUTFEE_LITTLE)
3061                                 {
3062                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3063                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3064                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3065                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3066                                 }
3067                                 else
3068                                 {
3069                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3070                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3071                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3072                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3073                                 }
3074                                 ret.push_back(t.full);
3075                         }
3076                 }
3077                 ret.push_back(0);
3078
3079                 while (!iter.atEnd())
3080                 {
3081                         uchar32_t c = *iter;
3082                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3083                                 c = unicode::swapEndian32(c);
3084                         ret.push_back(c);
3085                         ++iter;
3086                 }
3087                 return ret;
3088         }
3089
3090
3091         //! Converts the string to a wchar_t encoded string.
3092         /** The size of a wchar_t changes depending on the platform.  This function will store a
3093         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3094         //! \param endian The desired endianness of the string.
3095         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3096         //! \return A string containing the wchar_t encoded string.
3097         core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3098         {
3099                 if (sizeof(wchar_t) == 4)
3100                 {
3101                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3102                         core::stringw ret(a.pointer());
3103                         return ret;
3104                 }
3105                 else if (sizeof(wchar_t) == 2)
3106                 {
3107                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3108                         {
3109                                 core::stringw ret(array);
3110                                 return ret;
3111                         }
3112                         else
3113                         {
3114                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3115                                 core::stringw ret(a.pointer());
3116                                 return ret;
3117                         }
3118                 }
3119                 else if (sizeof(wchar_t) == 1)
3120                 {
3121                         core::array<uchar8_t> a(toUTF8(addBOM));
3122                         core::stringw ret(a.pointer());
3123                         return ret;
3124                 }
3125
3126                 // Shouldn't happen.
3127                 return core::stringw();
3128         }
3129
3130
3131         //! Converts the string to a wchar_t encoded string array.
3132         /** The size of a wchar_t changes depending on the platform.  This function will store a
3133         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3134         //! \param endian The desired endianness of the string.
3135         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3136         //! \return An array containing the wchar_t encoded string.
3137         core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3138         {
3139                 if (sizeof(wchar_t) == 4)
3140                 {
3141                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3142                         core::array<wchar_t> ret(a.size());
3143                         ret.set_used(a.size());
3144                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
3145                         return ret;
3146                 }
3147                 if (sizeof(wchar_t) == 2)
3148                 {
3149                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3150                         {
3151                                 core::array<wchar_t> ret(used);
3152                                 ret.set_used(used);
3153                                 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
3154                                 return ret;
3155                         }
3156                         else
3157                         {
3158                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3159                                 core::array<wchar_t> ret(a.size());
3160                                 ret.set_used(a.size());
3161                                 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
3162                                 return ret;
3163                         }
3164                 }
3165                 if (sizeof(wchar_t) == 1)
3166                 {
3167                         core::array<uchar8_t> a(toUTF8(addBOM));
3168                         core::array<wchar_t> ret(a.size());
3169                         ret.set_used(a.size());
3170                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
3171                         return ret;
3172                 }
3173
3174                 // Shouldn't happen.
3175                 return core::array<wchar_t>();
3176         }
3177
3178         //! Converts the string to a properly encoded io::path string.
3179         //! \param endian The desired endianness of the string.
3180         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3181         //! \return An io::path string containing the properly encoded string.
3182         io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3183         {
3184 #if defined(_IRR_WCHAR_FILESYSTEM)
3185                 return toWCHAR_s(endian, addBOM);
3186 #else
3187                 return toUTF8_s(addBOM);
3188 #endif
3189         }
3190
3191         //! Loads an unknown stream of data.
3192         //! Will attempt to determine if the stream is unicode data.  Useful for loading from files.
3193         //! \param data The data stream to load from.
3194         //! \param data_size The length of the data string.
3195         //! \return A reference to our current string.
3196         ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
3197         {
3198                 // Clear our string.
3199                 *this = "";
3200                 if (!data)
3201                         return *this;
3202
3203                 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
3204                 switch (e)
3205                 {
3206                         default:
3207                         case unicode::EUTFE_UTF8:
3208                                 append((uchar8_t*)data, data_size);
3209                                 break;
3210
3211                         case unicode::EUTFE_UTF16:
3212                         case unicode::EUTFE_UTF16_BE:
3213                         case unicode::EUTFE_UTF16_LE:
3214                                 append((uchar16_t*)data, data_size / 2);
3215                                 break;
3216
3217                         case unicode::EUTFE_UTF32:
3218                         case unicode::EUTFE_UTF32_BE:
3219                         case unicode::EUTFE_UTF32_LE:
3220                                 append((uchar32_t*)data, data_size / 4);
3221                                 break;
3222                 }
3223
3224                 return *this;
3225         }
3226
3227         //! Gets the encoding of the Unicode string this class contains.
3228         //! \return An enum describing the current encoding of this string.
3229         const unicode::EUTF_ENCODE getEncoding() const
3230         {
3231                 return encoding;
3232         }
3233
3234         //! Gets the endianness of the Unicode string this class contains.
3235         //! \return An enum describing the endianness of this string.
3236         const unicode::EUTF_ENDIAN getEndianness() const
3237         {
3238                 if (encoding == unicode::EUTFE_UTF16_LE ||
3239                         encoding == unicode::EUTFE_UTF32_LE)
3240                         return unicode::EUTFEE_LITTLE;
3241                 else return unicode::EUTFEE_BIG;
3242         }
3243
3244 private:
3245
3246         //! Reallocate the string, making it bigger or smaller.
3247         //! \param new_size The new size of the string.
3248         void reallocate(u32 new_size)
3249         {
3250                 uchar16_t* old_array = array;
3251
3252                 array = allocator.allocate(new_size + 1); //new u16[new_size];
3253                 allocated = new_size + 1;
3254                 if (old_array == 0) return;
3255
3256                 u32 amount = used < new_size ? used : new_size;
3257                 for (u32 i=0; i<=amount; ++i)
3258                         array[i] = old_array[i];
3259
3260                 if (allocated <= used)
3261                         used = allocated - 1;
3262
3263                 array[used] = 0;
3264
3265                 allocator.deallocate(old_array); // delete [] old_array;
3266         }
3267
        //--- member variables

        //! Buffer of UTF-16 code units obtained from the allocator; reallocate() writes a 0 at array[used].
        uchar16_t* array;
        //! Encoding tag returned by getEncoding(); getEndianness() derives the byte order from it.
        unicode::EUTF_ENCODE encoding;
        //! Number of elements allocated for array (reallocate() sets it to new_size + 1).
        u32 allocated;
        //! Number of code units in use, not counting the terminator stored at array[used].
        u32 used;
        //! Allocator instance used to allocate and release array.
        TAlloc allocator;
        //irrAllocator<uchar16_t> allocator;
3276 };
3277
//! Convenience name for ustring16 instantiated with irrAllocator<uchar16_t>.
typedef ustring16<irrAllocator<uchar16_t> > ustring;
3279
3280
3281 //! Appends two ustring16s.
3282 template <typename TAlloc>
3283 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
3284 {
3285         ustring16<TAlloc> ret(left);
3286         ret += right;
3287         return ret;
3288 }
3289
3290
3291 //! Appends a ustring16 and a null-terminated unicode string.
3292 template <typename TAlloc, class B>
3293 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
3294 {
3295         ustring16<TAlloc> ret(left);
3296         ret += right;
3297         return ret;
3298 }
3299
3300
3301 //! Appends a ustring16 and a null-terminated unicode string.
3302 template <class B, typename TAlloc>
3303 inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
3304 {
3305         ustring16<TAlloc> ret(left);
3306         ret += right;
3307         return ret;
3308 }
3309
3310
3311 //! Appends a ustring16 and an Irrlicht string.
3312 template <typename TAlloc, typename B, typename BAlloc>
3313 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B, BAlloc>& right)
3314 {
3315         ustring16<TAlloc> ret(left);
3316         ret += right;
3317         return ret;
3318 }
3319
3320
3321 //! Appends a ustring16 and an Irrlicht string.
3322 template <typename TAlloc, typename B, typename BAlloc>
3323 inline ustring16<TAlloc> operator+(const string<B, BAlloc>& left, const ustring16<TAlloc>& right)
3324 {
3325         ustring16<TAlloc> ret(left);
3326         ret += right;
3327         return ret;
3328 }
3329
3330
3331 //! Appends a ustring16 and a std::basic_string.
3332 template <typename TAlloc, typename B, typename A, typename BAlloc>
3333 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
3334 {
3335         ustring16<TAlloc> ret(left);
3336         ret += right;
3337         return ret;
3338 }
3339
3340
3341 //! Appends a ustring16 and a std::basic_string.
3342 template <typename TAlloc, typename B, typename A, typename BAlloc>
3343 inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
3344 {
3345         ustring16<TAlloc> ret(left);
3346         ret += right;
3347         return ret;
3348 }
3349
3350
3351 //! Appends a ustring16 and a char.
3352 template <typename TAlloc>
3353 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
3354 {
3355         ustring16<TAlloc> ret(left);
3356         ret += right;
3357         return ret;
3358 }
3359
3360
3361 //! Appends a ustring16 and a char.
3362 template <typename TAlloc>
3363 inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
3364 {
3365         ustring16<TAlloc> ret(left);
3366         ret += right;
3367         return ret;
3368 }
3369
3370
#ifdef USTRING_CPP0X_NEWLITERALS
//! Concatenates a ustring16 with a single uchar32_t.
//! \param left The ustring16 that forms the start of the result.
//! \param right The character appended after it.
//! \return A new ustring16 holding left followed by right.
template <typename TAlloc>
inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
{
        ustring16<TAlloc> joined(left);
        joined += right;
        return joined;
}


//! Concatenates a single uchar32_t with a ustring16.
//! \param left The character that forms the start of the result.
//! \param right The ustring16 appended after it.
//! \return A new ustring16 holding left followed by right.
template <typename TAlloc>
inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
{
        ustring16<TAlloc> joined(left);
        joined += right;
        return joined;
}
#endif
3391
3392
3393 //! Appends a ustring16 and a short.
3394 template <typename TAlloc>
3395 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
3396 {
3397         ustring16<TAlloc> ret(left);
3398         ret += core::stringc(right);
3399         return ret;
3400 }
3401
3402
3403 //! Appends a ustring16 and a short.
3404 template <typename TAlloc>
3405 inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
3406 {
3407         ustring16<TAlloc> ret((core::stringc(left)));
3408         ret += right;
3409         return ret;
3410 }
3411
3412
3413 //! Appends a ustring16 and an unsigned short.
3414 template <typename TAlloc>
3415 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
3416 {
3417         ustring16<TAlloc> ret(left);
3418         ret += core::stringc(right);
3419         return ret;
3420 }
3421
3422
3423 //! Appends a ustring16 and an unsigned short.
3424 template <typename TAlloc>
3425 inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
3426 {
3427         ustring16<TAlloc> ret((core::stringc(left)));
3428         ret += right;
3429         return ret;
3430 }
3431
3432
3433 //! Appends a ustring16 and an int.
3434 template <typename TAlloc>
3435 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
3436 {
3437         ustring16<TAlloc> ret(left);
3438         ret += core::stringc(right);
3439         return ret;
3440 }
3441
3442
3443 //! Appends a ustring16 and an int.
3444 template <typename TAlloc>
3445 inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
3446 {
3447         ustring16<TAlloc> ret((core::stringc(left)));
3448         ret += right;
3449         return ret;
3450 }
3451
3452
3453 //! Appends a ustring16 and an unsigned int.
3454 template <typename TAlloc>
3455 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
3456 {
3457         ustring16<TAlloc> ret(left);
3458         ret += core::stringc(right);
3459         return ret;
3460 }
3461
3462
3463 //! Appends a ustring16 and an unsigned int.
3464 template <typename TAlloc>
3465 inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
3466 {
3467         ustring16<TAlloc> ret((core::stringc(left)));
3468         ret += right;
3469         return ret;
3470 }
3471
3472
3473 //! Appends a ustring16 and a long.
3474 template <typename TAlloc>
3475 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
3476 {
3477         ustring16<TAlloc> ret(left);
3478         ret += core::stringc(right);
3479         return ret;
3480 }
3481
3482
3483 //! Appends a ustring16 and a long.
3484 template <typename TAlloc>
3485 inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
3486 {
3487         ustring16<TAlloc> ret((core::stringc(left)));
3488         ret += right;
3489         return ret;
3490 }
3491
3492
3493 //! Appends a ustring16 and an unsigned long.
3494 template <typename TAlloc>
3495 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
3496 {
3497         ustring16<TAlloc> ret(left);
3498         ret += core::stringc(right);
3499         return ret;
3500 }
3501
3502
3503 //! Appends a ustring16 and an unsigned long.
3504 template <typename TAlloc>
3505 inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
3506 {
3507         ustring16<TAlloc> ret((core::stringc(left)));
3508         ret += right;
3509         return ret;
3510 }
3511
3512
3513 //! Appends a ustring16 and a float.
3514 template <typename TAlloc>
3515 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
3516 {
3517         ustring16<TAlloc> ret(left);
3518         ret += core::stringc(right);
3519         return ret;
3520 }
3521
3522
3523 //! Appends a ustring16 and a float.
3524 template <typename TAlloc>
3525 inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
3526 {
3527         ustring16<TAlloc> ret((core::stringc(left)));
3528         ret += right;
3529         return ret;
3530 }
3531
3532
3533 //! Appends a ustring16 and a double.
3534 template <typename TAlloc>
3535 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
3536 {
3537         ustring16<TAlloc> ret(left);
3538         ret += core::stringc(right);
3539         return ret;
3540 }
3541
3542
3543 //! Appends a ustring16 and a double.
3544 template <typename TAlloc>
3545 inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
3546 {
3547         ustring16<TAlloc> ret((core::stringc(left)));
3548         ret += right;
3549         return ret;
3550 }
3551
3552
3553 #ifdef USTRING_CPP0X
3554 //! Appends two ustring16s.
3555 template <typename TAlloc>
3556 inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
3557 {
3558         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3559         right.insert(left, 0);
3560         return std::move(right);
3561 }
3562
3563
3564 //! Appends two ustring16s.
3565 template <typename TAlloc>
3566 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
3567 {
3568         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3569         left.append(right);
3570         return std::move(left);
3571 }
3572
3573
3574 //! Appends two ustring16s.
3575 template <typename TAlloc>
3576 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
3577 {
3578         //std::cout << "MOVE operator+(&&, &&)" << std::endl;
3579         if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3580                 (right.capacity() - right.size_raw() < left.size_raw()))
3581         {
3582                 left.append(right);
3583                 return std::move(left);
3584         }
3585         else
3586         {
3587                 right.insert(left, 0);
3588                 return std::move(right);
3589         }
3590 }
3591
3592
3593 //! Appends a ustring16 and a null-terminated unicode string.
3594 template <typename TAlloc, class B>
3595 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
3596 {
3597         //std::cout << "MOVE operator+(&&, B*)" << std::endl;
3598         left.append(right);
3599         return std::move(left);
3600 }
3601
3602
3603 //! Appends a ustring16 and a null-terminated unicode string.
3604 template <class B, typename TAlloc>
3605 inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
3606 {
3607         //std::cout << "MOVE operator+(B*, &&)" << std::endl;
3608         right.insert(left, 0);
3609         return std::move(right);
3610 }
3611
3612
3613 //! Appends a ustring16 and an Irrlicht string.
3614 template <typename TAlloc, typename B, typename BAlloc>
3615 inline ustring16<TAlloc>&& operator+(const string<B, BAlloc>& left, ustring16<TAlloc>&& right)
3616 {
3617         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3618         right.insert(left, 0);
3619         return std::move(right);
3620 }
3621
3622
3623 //! Appends a ustring16 and an Irrlicht string.
3624 template <typename TAlloc, typename B, typename BAlloc>
3625 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B, BAlloc>& right)
3626 {
3627         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3628         left.append(right);
3629         return std::move(left);
3630 }
3631
3632
3633 //! Appends a ustring16 and a std::basic_string.
3634 template <typename TAlloc, typename B, typename A, typename BAlloc>
3635 inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
3636 {
3637         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3638         right.insert(core::ustring16<TAlloc>(left), 0);
3639         return std::move(right);
3640 }
3641
3642
3643 //! Appends a ustring16 and a std::basic_string.
3644 template <typename TAlloc, typename B, typename A, typename BAlloc>
3645 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
3646 {
3647         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3648         left.append(right);
3649         return std::move(left);
3650 }
3651
3652
3653 //! Appends a ustring16 and a char.
3654 template <typename TAlloc>
3655 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
3656 {
3657         left.append((uchar32_t)right);
3658         return std::move(left);
3659 }
3660
3661
3662 //! Appends a ustring16 and a char.
3663 template <typename TAlloc>
3664 inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
3665 {
3666         right.insert((uchar32_t)left, 0);
3667         return std::move(right);
3668 }
3669
3670
#ifdef USTRING_CPP0X_NEWLITERALS
//! Appends a single uchar32_t to a temporary ustring16 and hands the string on.
//! \param left The temporary ustring16 that is extended.
//! \param right The character appended.
//! \return A ustring16 move-constructed from the extended left.
template <typename TAlloc>
inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
{
        ustring16<TAlloc>& target = left;
        target.append(right);
        return std::move(target);
}


//! Puts a single uchar32_t in front of a temporary ustring16 and hands the string on.
//! \param left The character inserted at position 0.
//! \param right The temporary ustring16 that is extended.
//! \return A ustring16 move-constructed from the extended right.
template <typename TAlloc>
inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
{
        ustring16<TAlloc>& target = right;
        target.insert(left, 0);
        return std::move(target);
}
#endif
3689
3690
3691 //! Appends a ustring16 and a short.
3692 template <typename TAlloc>
3693 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
3694 {
3695         left.append(core::stringc(right));
3696         return std::move(left);
3697 }
3698
3699
3700 //! Appends a ustring16 and a short.
3701 template <typename TAlloc>
3702 inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
3703 {
3704         right.insert(core::stringc(left), 0);
3705         return std::move(right);
3706 }
3707
3708
3709 //! Appends a ustring16 and an unsigned short.
3710 template <typename TAlloc>
3711 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
3712 {
3713         left.append(core::stringc(right));
3714         return std::move(left);
3715 }
3716
3717
3718 //! Appends a ustring16 and an unsigned short.
3719 template <typename TAlloc>
3720 inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
3721 {
3722         right.insert(core::stringc(left), 0);
3723         return std::move(right);
3724 }
3725
3726
3727 //! Appends a ustring16 and an int.
3728 template <typename TAlloc>
3729 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
3730 {
3731         left.append(core::stringc(right));
3732         return std::move(left);
3733 }
3734
3735
3736 //! Appends a ustring16 and an int.
3737 template <typename TAlloc>
3738 inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
3739 {
3740         right.insert(core::stringc(left), 0);
3741         return std::move(right);
3742 }
3743
3744
3745 //! Appends a ustring16 and an unsigned int.
3746 template <typename TAlloc>
3747 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
3748 {
3749         left.append(core::stringc(right));
3750         return std::move(left);
3751 }
3752
3753
3754 //! Appends a ustring16 and an unsigned int.
3755 template <typename TAlloc>
3756 inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
3757 {
3758         right.insert(core::stringc(left), 0);
3759         return std::move(right);
3760 }
3761
3762
3763 //! Appends a ustring16 and a long.
3764 template <typename TAlloc>
3765 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
3766 {
3767         left.append(core::stringc(right));
3768         return std::move(left);
3769 }
3770
3771
3772 //! Appends a ustring16 and a long.
3773 template <typename TAlloc>
3774 inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
3775 {
3776         right.insert(core::stringc(left), 0);
3777         return std::move(right);
3778 }
3779
3780
3781 //! Appends a ustring16 and an unsigned long.
3782 template <typename TAlloc>
3783 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
3784 {
3785         left.append(core::stringc(right));
3786         return std::move(left);
3787 }
3788
3789
3790 //! Appends a ustring16 and an unsigned long.
3791 template <typename TAlloc>
3792 inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
3793 {
3794         right.insert(core::stringc(left), 0);
3795         return std::move(right);
3796 }
3797
3798
3799 //! Appends a ustring16 and a float.
3800 template <typename TAlloc>
3801 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
3802 {
3803         left.append(core::stringc(right));
3804         return std::move(left);
3805 }
3806
3807
3808 //! Appends a ustring16 and a float.
3809 template <typename TAlloc>
3810 inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
3811 {
3812         right.insert(core::stringc(left), 0);
3813         return std::move(right);
3814 }
3815
3816
3817 //! Appends a ustring16 and a double.
3818 template <typename TAlloc>
3819 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
3820 {
3821         left.append(core::stringc(right));
3822         return std::move(left);
3823 }
3824
3825
3826 //! Appends a ustring16 and a double.
3827 template <typename TAlloc>
3828 inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
3829 {
3830         right.insert(core::stringc(left), 0);
3831         return std::move(right);
3832 }
3833 #endif
3834
3835
3836 #ifndef USTRING_NO_STL
3837 //! Writes a ustring16 to an ostream.
3838 template <typename TAlloc>
3839 inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
3840 {
3841         out << in.toUTF8_s().c_str();
3842         return out;
3843 }
3844
3845 //! Writes a ustring16 to a wostream.
3846 template <typename TAlloc>
3847 inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
3848 {
3849         out << in.toWCHAR_s().c_str();
3850         return out;
3851 }
3852 #endif
3853
3854
3855 #ifndef USTRING_NO_STL
3856
3857 namespace unicode
3858 {
3859
3860 //! Hashing algorithm for hashing a ustring.  Used for things like unordered_maps.
3861 //! Algorithm taken from std::hash<std::string>.
3862 class hash : public std::unary_function<core::ustring, size_t>
3863 {
3864         public:
3865                 size_t operator()(const core::ustring& s) const
3866                 {
3867                         size_t ret = 2166136261U;
3868                         size_t index = 0;
3869                         size_t stride = 1 + s.size_raw() / 10;
3870
3871                         core::ustring::const_iterator i = s.begin();
3872                         while (i != s.end())
3873                         {
3874                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
3875                                 ret = 16777619U * ret ^ (size_t)s[(u32)index];
3876                                 index += stride;
3877                                 i += stride;
3878                         }
3879                         return (ret);
3880                 }
3881 };
3882
3883 } // end namespace unicode
3884
3885 #endif
3886
3887 } // end namespace core
3888 } // end namespace irr
3889
3890 #endif