src/cguittfont/irrUString.h

   1 /*
   2    Basic Unicode string class for Irrlicht.
   3    Copyright (c) 2009-2011 John Norman
   4
   5    This software is provided 'as-is', without any express or implied
   6    warranty. In no event will the authors be held liable for any
   7    damages arising from the use of this software.
   8
   9    Permission is granted to anyone to use this software for any
  10    purpose, including commercial applications, and to alter it and
  11    redistribute it freely, subject to the following restrictions:
  12
  13    1. The origin of this software must not be misrepresented; you
  14       must not claim that you wrote the original software. If you use
  15       this software in a product, an acknowledgment in the product
  16       documentation would be appreciated but is not required.
  17
  18    2. Altered source versions must be plainly marked as such, and
  19       must not be misrepresented as being the original software.
  20
  21    3. This notice may not be removed or altered from any source
  22       distribution.
  23
  24    The original version of this class can be located at:
  25    http://irrlicht.suckerfreegames.com/
  26
  27    John Norman
  28    john@suckerfreegames.com
  29 */
  30
  31 #ifndef __IRR_USTRING_H_INCLUDED__
  32 #define __IRR_USTRING_H_INCLUDED__
  33
  34 #if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
  35 #       define USTRING_CPP0X
  36 #       if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
  37 #               define USTRING_CPP0X_NEWLITERALS
  38 #       endif
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <string.h>
  43 #include <stdlib.h>
  44 #ifdef _WIN32
  45 #define __BYTE_ORDER 0
  46 #define __LITTLE_ENDIAN 0
  47 #define __BIG_ENDIAN 1
  48 #elif defined(__MACH__) && defined(__APPLE__)
  49 #include <machine/endian.h>
  50 #elif defined(__FreeBSD__)
  51 #include <sys/endian.h>
  52 #else
  53 #include <endian.h>
  54 #endif
  55
  56 #ifdef USTRING_CPP0X
  57 #       include <utility>
  58 #endif
  59
  60 #ifndef USTRING_NO_STL
  61 #       include <string>
  62 #       include <iterator>
  63 #       include <ostream>
  64 #endif
  65
  66 #include "irrTypes.h"
  67 #include "irrAllocator.h"
  68 #include "irrArray.h"
  69 #include "irrMath.h"
  70 #include "irrString.h"
  71 #include "path.h"
  72
  73 //! UTF-16 surrogate start values.
  74 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
  75 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
  76
//! Is a UTF-16 code unit a surrogate (either half), a high surrogate, or a low surrogate?
  78 #define UTF16_IS_SURROGATE(c)           (((c) & 0xF800) == 0xD800)
  79 #define UTF16_IS_SURROGATE_HI(c)        (((c) & 0xFC00) == 0xD800)
  80 #define UTF16_IS_SURROGATE_LO(c)        (((c) & 0xFC00) == 0xDC00)
  81
  82
  83 namespace irr
  84 {
  85
  86         // Define our character types.
  87 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
  88         typedef char32_t uchar32_t;
  89         typedef char16_t uchar16_t;
  90         typedef char uchar8_t;
  91 #else
  92         typedef u32 uchar32_t;
  93         typedef u16 uchar16_t;
  94         typedef u8 uchar8_t;
  95 #endif
  96
  97 namespace core
  98 {
  99
 100 namespace unicode
 101 {
 102
 103 //! The unicode replacement character.  Used to replace invalid characters.
 104 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
 105
 106 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
 107 //! \param high The high value of the pair.
 108 //! \param low The low value of the pair.
 109 //! \return The UTF-32 character expressed by the surrogate pair.
 110 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
 111 {
 112         // Convert the surrogate pair into a single UTF-32 character.
 113         uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
 114         uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
 115         return (wu << 16) | x;
 116 }
 117
 118 //! Swaps the endianness of a 16-bit value.
 119 //! \return The new value.
 120 inline uchar16_t swapEndian16(const uchar16_t& c)
 121 {
 122         return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
 123 }
 124
 125 //! Swaps the endianness of a 32-bit value.
 126 //! \return The new value.
 127 inline uchar32_t swapEndian32(const uchar32_t& c)
 128 {
 129         return  ((c >> 24) & 0x000000FF) |
 130                         ((c >> 8)  & 0x0000FF00) |
 131                         ((c << 8)  & 0x00FF0000) |
 132                         ((c << 24) & 0xFF000000);
 133 }
 134
 135 //! The Unicode byte order mark.
 136 const u16 BOM = 0xFEFF;
 137
 138 //! The size of the Unicode byte order mark in terms of the Unicode character size.
 139 const u8 BOM_UTF8_LEN = 3;
 140 const u8 BOM_UTF16_LEN = 1;
 141 const u8 BOM_UTF32_LEN = 1;
 142
 143 //! Unicode byte order marks for file operations.
 144 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
 145 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
 146 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
 147 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
 148 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
 149
//! The size in bytes of the Unicode byte order marks for file operations.
 151 const u8 BOM_ENCODE_UTF8_LEN = 3;
 152 const u8 BOM_ENCODE_UTF16_LEN = 2;
 153 const u8 BOM_ENCODE_UTF32_LEN = 4;
 154
 155 //! Unicode encoding type.
 156 enum EUTF_ENCODE
 157 {
 158         EUTFE_NONE              = 0,
 159         EUTFE_UTF8,
 160         EUTFE_UTF16,
 161         EUTFE_UTF16_LE,
 162         EUTFE_UTF16_BE,
 163         EUTFE_UTF32,
 164         EUTFE_UTF32_LE,
 165         EUTFE_UTF32_BE
 166 };
 167
 168 //! Unicode endianness.
 169 enum EUTF_ENDIAN
 170 {
 171         EUTFEE_NATIVE   = 0,
 172         EUTFEE_LITTLE,
 173         EUTFEE_BIG
 174 };
 175
 176 //! Returns the specified unicode byte order mark in a byte array.
 177 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
 178 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
 179                 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
 180 //! \return An array that contains a byte order mark.
 181 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
 182 {
 183 #define COPY_ARRAY(source, size) \
 184         memcpy(ret.pointer(), source, size); \
 185         ret.set_used(size)
 186
 187         core::array<u8> ret(4);
 188         switch (mode)
 189         {
 190                 case EUTFE_UTF8:
 191                         COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
 192                         break;
 193                 case EUTFE_UTF16:
 194                         #ifdef __BIG_ENDIAN__
 195                                 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 196                         #else
 197                                 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 198                         #endif
 199                         break;
 200                 case EUTFE_UTF16_BE:
 201                         COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 202                         break;
 203                 case EUTFE_UTF16_LE:
 204                         COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 205                         break;
 206                 case EUTFE_UTF32:
 207                         #ifdef __BIG_ENDIAN__
 208                                 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 209                         #else
 210                                 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 211                         #endif
 212                         break;
 213                 case EUTFE_UTF32_BE:
 214                         COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 215                         break;
 216                 case EUTFE_UTF32_LE:
 217                         COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 218                         break;
 219                 case EUTFE_NONE:
 220                         // TODO sapier: fixed warning only,
 221                         // don't know if something needs to be done here
 222                         break;
 223         }
 224         return ret;
 225
 226 #undef COPY_ARRAY
 227 }
 228
 229 //! Detects if the given data stream starts with a unicode BOM.
 230 //! \param data The data stream to check.
 231 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
 232 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
 233 {
 234         if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
 235         if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
 236         if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
 237         if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
 238         if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
 239         return EUTFE_NONE;
 240 }
 241
 242 } // end namespace unicode
 243
 244
 245 //! UTF-16 string class.
 246 template <typename TAlloc = irrAllocator<uchar16_t> >
 247 class ustring16
 248 {
 249 public:
 250
 251         ///------------------///
 252         /// iterator classes ///
 253         ///------------------///
 254
 255         //! Access an element in a unicode string, allowing one to change it.
 256         class _ustring16_iterator_access
 257         {
 258                 public:
 259                         _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
 260
 261                         //! Allow the class to be interpreted as a single UTF-32 character.
 262                         operator uchar32_t() const
 263                         {
 264                                 return _get();
 265                         }
 266
 267                         //! Allow one to change the character in the unicode string.
 268                         //! \param c The new character to use.
 269                         //! \return Myself.
 270                         _ustring16_iterator_access& operator=(const uchar32_t c)
 271                         {
 272                                 _set(c);
 273                                 return *this;
 274                         }
 275
 276                         //! Increments the value by 1.
 277                         //! \return Myself.
 278                         _ustring16_iterator_access& operator++()
 279                         {
 280                                 _set(_get() + 1);
 281                                 return *this;
 282                         }
 283
 284                         //! Increments the value by 1, returning the old value.
 285                         //! \return A unicode character.
 286                         uchar32_t operator++(int)
 287                         {
 288                                 uchar32_t old = _get();
 289                                 _set(old + 1);
 290                                 return old;
 291                         }
 292
 293                         //! Decrements the value by 1.
 294                         //! \return Myself.
 295                         _ustring16_iterator_access& operator--()
 296                         {
 297                                 _set(_get() - 1);
 298                                 return *this;
 299                         }
 300
 301                         //! Decrements the value by 1, returning the old value.
 302                         //! \return A unicode character.
 303                         uchar32_t operator--(int)
 304                         {
 305                                 uchar32_t old = _get();
 306                                 _set(old - 1);
 307                                 return old;
 308                         }
 309
 310                         //! Adds to the value by a specified amount.
 311                         //! \param val The amount to add to this character.
 312                         //! \return Myself.
 313                         _ustring16_iterator_access& operator+=(int val)
 314                         {
 315                                 _set(_get() + val);
 316                                 return *this;
 317                         }
 318
 319                         //! Subtracts from the value by a specified amount.
 320                         //! \param val The amount to subtract from this character.
 321                         //! \return Myself.
 322                         _ustring16_iterator_access& operator-=(int val)
 323                         {
 324                                 _set(_get() - val);
 325                                 return *this;
 326                         }
 327
 328                         //! Multiples the value by a specified amount.
 329                         //! \param val The amount to multiply this character by.
 330                         //! \return Myself.
 331                         _ustring16_iterator_access& operator*=(int val)
 332                         {
 333                                 _set(_get() * val);
 334                                 return *this;
 335                         }
 336
 337                         //! Divides the value by a specified amount.
 338                         //! \param val The amount to divide this character by.
 339                         //! \return Myself.
 340                         _ustring16_iterator_access& operator/=(int val)
 341                         {
 342                                 _set(_get() / val);
 343                                 return *this;
 344                         }
 345
 346                         //! Modulos the value by a specified amount.
 347                         //! \param val The amount to modulo this character by.
 348                         //! \return Myself.
 349                         _ustring16_iterator_access& operator%=(int val)
 350                         {
 351                                 _set(_get() % val);
 352                                 return *this;
 353                         }
 354
 355                         //! Adds to the value by a specified amount.
 356                         //! \param val The amount to add to this character.
 357                         //! \return A unicode character.
 358                         uchar32_t operator+(int val) const
 359                         {
 360                                 return _get() + val;
 361                         }
 362
 363                         //! Subtracts from the value by a specified amount.
 364                         //! \param val The amount to subtract from this character.
 365                         //! \return A unicode character.
 366                         uchar32_t operator-(int val) const
 367                         {
 368                                 return _get() - val;
 369                         }
 370
 371                         //! Multiplies the value by a specified amount.
 372                         //! \param val The amount to multiply this character by.
 373                         //! \return A unicode character.
 374                         uchar32_t operator*(int val) const
 375                         {
 376                                 return _get() * val;
 377                         }
 378
 379                         //! Divides the value by a specified amount.
 380                         //! \param val The amount to divide this character by.
 381                         //! \return A unicode character.
 382                         uchar32_t operator/(int val) const
 383                         {
 384                                 return _get() / val;
 385                         }
 386
 387                         //! Modulos the value by a specified amount.
 388                         //! \param val The amount to modulo this character by.
 389                         //! \return A unicode character.
 390                         uchar32_t operator%(int val) const
 391                         {
 392                                 return _get() % val;
 393                         }
 394
 395                 private:
 396                         //! Gets a uchar32_t from our current position.
 397                         uchar32_t _get() const
 398                         {
 399                                 const uchar16_t* a = ref->c_str();
 400                                 if (!UTF16_IS_SURROGATE(a[pos]))
 401                                         return static_cast<uchar32_t>(a[pos]);
 402                                 else
 403                                 {
 404                                         if (pos + 1 >= ref->size_raw())
 405                                                 return 0;
 406
 407                                         return unicode::toUTF32(a[pos], a[pos + 1]);
 408                                 }
 409                         }
 410
 411                         //! Sets a uchar32_t at our current position.
 412                         void _set(uchar32_t c)
 413                         {
 414                                 ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
 415                                 const uchar16_t* a = ref2->c_str();
 416                                 if (c > 0xFFFF)
 417                                 {
 418                                         // c will be multibyte, so split it up into the high and low surrogate pairs.
 419                                         uchar16_t x = static_cast<uchar16_t>(c);
 420                                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
 421                                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
 422
 423                                         // If the previous position was a surrogate pair, just replace them.  Else, insert the low pair.
 424                                         if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
 425                                                 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
 426                                         else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
 427
 428                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 429                                 }
 430                                 else
 431                                 {
 432                                         // c will be a single byte.
 433                                         uchar16_t vh = static_cast<uchar16_t>(c);
 434
 435                                         // If the previous position was a surrogate pair, remove the extra byte.
 436                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 437                                                 ref2->erase_raw(static_cast<u32>(pos) + 1);
 438
 439                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 440                                 }
 441                         }
 442
 443                         const ustring16<TAlloc>* ref;
 444                         u32 pos;
 445         };
 446         typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
 447
 448
 449         //! Iterator to iterate through a UTF-16 string.
 450 #ifndef USTRING_NO_STL
 451         class _ustring16_const_iterator : public std::iterator<
 452                 std::bidirectional_iterator_tag,        // iterator_category
 453                 access,                                                         // value_type
 454                 ptrdiff_t,                                                      // difference_type
 455                 const access,                                           // pointer
 456                 const access                                            // reference
 457         >
 458 #else
 459         class _ustring16_const_iterator
 460 #endif
 461         {
 462                 public:
 463                         typedef _ustring16_const_iterator _Iter;
 464                         typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
 465                         typedef const access const_pointer;
 466                         typedef const access const_reference;
 467
 468 #ifndef USTRING_NO_STL
 469                         typedef typename _Base::value_type value_type;
 470                         typedef typename _Base::difference_type difference_type;
 471                         typedef typename _Base::difference_type distance_type;
 472                         typedef typename _Base::pointer pointer;
 473                         typedef const_reference reference;
 474 #else
 475                         typedef access value_type;
 476                         typedef u32 difference_type;
 477                         typedef u32 distance_type;
 478                         typedef const_pointer pointer;
 479                         typedef const_reference reference;
 480 #endif
 481
 482                         //! Constructors.
 483                         _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
 484                         _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
 485                         _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
 486                         {
 487                                 if (ref->size_raw() == 0 || p == 0)
 488                                         return;
 489
 490                                 // Go to the appropriate position.
 491                                 u32 i = p;
 492                                 u32 sr = ref->size_raw();
 493                                 const uchar16_t* a = ref->c_str();
 494                                 while (i != 0 && pos < sr)
 495                                 {
 496                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 497                                                 pos += 2;
 498                                         else ++pos;
 499                                         --i;
 500                                 }
 501                         }
 502
 503                         //! Test for equalness.
 504                         bool operator==(const _Iter& iter) const
 505                         {
 506                                 if (ref == iter.ref && pos == iter.pos)
 507                                         return true;
 508                                 return false;
 509                         }
 510
 511                         //! Test for unequalness.
 512                         bool operator!=(const _Iter& iter) const
 513                         {
 514                                 if (ref != iter.ref || pos != iter.pos)
 515                                         return true;
 516                                 return false;
 517                         }
 518
 519                         //! Switch to the next full character in the string.
 520                         _Iter& operator++()
 521                         {       // ++iterator
 522                                 if (pos == ref->size_raw()) return *this;
 523                                 const uchar16_t* a = ref->c_str();
 524                                 if (UTF16_IS_SURROGATE_HI(a[pos]))
 525                                         pos += 2;                       // TODO: check for valid low surrogate?
 526                                 else ++pos;
 527                                 if (pos > ref->size_raw()) pos = ref->size_raw();
 528                                 return *this;
 529                         }
 530
 531                         //! Switch to the next full character in the string, returning the previous position.
 532                         _Iter operator++(int)
 533                         {       // iterator++
 534                                 _Iter _tmp(*this);
 535                                 ++*this;
 536                                 return _tmp;
 537                         }
 538
 539                         //! Switch to the previous full character in the string.
 540                         _Iter& operator--()
 541                         {       // --iterator
 542                                 if (pos == 0) return *this;
 543                                 const uchar16_t* a = ref->c_str();
 544                                 --pos;
 545                                 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0)  // low surrogate, go back one more.
 546                                         --pos;
 547                                 return *this;
 548                         }
 549
 550                         //! Switch to the previous full character in the string, returning the previous position.
 551                         _Iter operator--(int)
 552                         {       // iterator--
 553                                 _Iter _tmp(*this);
 554                                 --*this;
 555                                 return _tmp;
 556                         }
 557
 558                         //! Advance a specified number of full characters in the string.
 559                         //! \return Myself.
 560                         _Iter& operator+=(const difference_type v)
 561                         {
 562                                 if (v == 0) return *this;
 563                                 if (v < 0) return operator-=(v * -1);
 564
 565                                 if (pos >= ref->size_raw())
 566                                         return *this;
 567
 568                                 // Go to the appropriate position.
 569                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 570                                 u32 i = (u32)v;
 571                                 u32 sr = ref->size_raw();
 572                                 const uchar16_t* a = ref->c_str();
 573                                 while (i != 0 && pos < sr)
 574                                 {
 575                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 576                                                 pos += 2;
 577                                         else ++pos;
 578                                         --i;
 579                                 }
 580                                 if (pos > sr)
 581                                         pos = sr;
 582
 583                                 return *this;
 584                         }
 585
 586                         //! Go back a specified number of full characters in the string.
 587                         //! \return Myself.
 588                         _Iter& operator-=(const difference_type v)
 589                         {
 590                                 if (v == 0) return *this;
 591                                 if (v > 0) return operator+=(v * -1);
 592
 593                                 if (pos == 0)
 594                                         return *this;
 595
 596                                 // Go to the appropriate position.
 597                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 598                                 u32 i = (u32)v;
 599                                 const uchar16_t* a = ref->c_str();
 600                                 while (i != 0 && pos != 0)
 601                                 {
 602                                         --pos;
 603                                         if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
 604                                                 --pos;
 605                                         --i;
 606                                 }
 607
 608                                 return *this;
 609                         }
 610
 611                         //! Return a new iterator that is a variable number of full characters forward from the current position.
 612                         _Iter operator+(const difference_type v) const
 613                         {
 614                                 _Iter ret(*this);
 615                                 ret += v;
 616                                 return ret;
 617                         }
 618
 619                         //! Return a new iterator that is a variable number of full characters backward from the current position.
 620                         _Iter operator-(const difference_type v) const
 621                         {
 622                                 _Iter ret(*this);
 623                                 ret -= v;
 624                                 return ret;
 625                         }
 626
 627                         //! Returns the distance between two iterators.
 628                         difference_type operator-(const _Iter& iter) const
 629                         {
 630                                 // Make sure we reference the same object!
 631                                 if (ref != iter.ref)
 632                                         return difference_type();
 633
 634                                 _Iter i = iter;
 635                                 difference_type ret;
 636
 637                                 // Walk up.
 638                                 if (pos > i.pos)
 639                                 {
 640                                         while (pos > i.pos)
 641                                         {
 642                                                 ++i;
 643                                                 ++ret;
 644                                         }
 645                                         return ret;
 646                                 }
 647
 648                                 // Walk down.
 649                                 while (pos < i.pos)
 650                                 {
 651                                         --i;
 652                                         --ret;
 653                                 }
 654                                 return ret;
 655                         }
 656
 657                         //! Accesses the full character at the iterator's position.
 658                         const_reference operator*() const
 659                         {
 660                                 if (pos >= ref->size_raw())
 661                                 {
 662                                         const uchar16_t* a = ref->c_str();
 663                                         u32 p = ref->size_raw();
 664                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 665                                                 --p;
 666                                         reference ret(ref, p);
 667                                         return ret;
 668                                 }
 669                                 const_reference ret(ref, pos);
 670                                 return ret;
 671                         }
 672
 673                         //! Accesses the full character at the iterator's position.
 674                         reference operator*()
 675                         {
 676                                 if (pos >= ref->size_raw())
 677                                 {
 678                                         const uchar16_t* a = ref->c_str();
 679                                         u32 p = ref->size_raw();
 680                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 681                                                 --p;
 682                                         reference ret(ref, p);
 683                                         return ret;
 684                                 }
 685                                 reference ret(ref, pos);
 686                                 return ret;
 687                         }
 688
 689                         //! Accesses the full character at the iterator's position.
 690                         const_pointer operator->() const
 691                         {
 692                                 return operator*();
 693                         }
 694
 695                         //! Accesses the full character at the iterator's position.
 696                         pointer operator->()
 697                         {
 698                                 return operator*();
 699                         }
 700
 701                         //! Is the iterator at the start of the string?
 702                         bool atStart() const
 703                         {
 704                                 return pos == 0;
 705                         }
 706
 707                         //! Is the iterator at the end of the string?
 708                         bool atEnd() const
 709                         {
 710                                 const uchar16_t* a = ref->c_str();
 711                                 if (UTF16_IS_SURROGATE(a[pos]))
 712                                         return (pos + 1) >= ref->size_raw();
 713                                 else return pos >= ref->size_raw();
 714                         }
 715
 716                         //! Moves the iterator to the start of the string.
 717                         void toStart()
 718                         {
 719                                 pos = 0;
 720                         }
 721
 722                         //! Moves the iterator to the end of the string.
 723                         void toEnd()
 724                         {
 725                                 pos = ref->size_raw();
 726                         }
 727
 728                         //! Returns the iterator's position.
 729                         //! \return The iterator's position.
 730                         u32 getPos() const
 731                         {
 732                                 return pos;
 733                         }
 734
 735                 protected:
                        //! String being iterated; its raw size and buffer are read through this pointer.
                        const ustring16<TAlloc>* ref;
                        //! Current index into the string's raw UTF-16 array.
                        u32 pos;
 738         };
 739
 740         //! Iterator to iterate through a UTF-16 string.
 741         class _ustring16_iterator : public _ustring16_const_iterator
 742         {
 743                 public:
 744                         typedef _ustring16_iterator _Iter;
 745                         typedef _ustring16_const_iterator _Base;
 746                         typedef typename _Base::const_pointer const_pointer;
 747                         typedef typename _Base::const_reference const_reference;
 748
 749
 750                         typedef typename _Base::value_type value_type;
 751                         typedef typename _Base::difference_type difference_type;
 752                         typedef typename _Base::distance_type distance_type;
 753                         typedef access pointer;
 754                         typedef access reference;
 755
 756                         using _Base::pos;
 757                         using _Base::ref;
 758
 759                         //! Constructors.
 760                         _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
 761                         _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
 762                         _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
 763
 764                         //! Accesses the full character at the iterator's position.
 765                         reference operator*() const
 766                         {
 767                                 if (pos >= ref->size_raw())
 768                                 {
 769                                         const uchar16_t* a = ref->c_str();
 770                                         u32 p = ref->size_raw();
 771                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 772                                                 --p;
 773                                         reference ret(ref, p);
 774                                         return ret;
 775                                 }
 776                                 reference ret(ref, pos);
 777                                 return ret;
 778                         }
 779
 780                         //! Accesses the full character at the iterator's position.
 781                         reference operator*()
 782                         {
 783                                 if (pos >= ref->size_raw())
 784                                 {
 785                                         const uchar16_t* a = ref->c_str();
 786                                         u32 p = ref->size_raw();
 787                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 788                                                 --p;
 789                                         reference ret(ref, p);
 790                                         return ret;
 791                                 }
 792                                 reference ret(ref, pos);
 793                                 return ret;
 794                         }
 795
 796                         //! Accesses the full character at the iterator's position.
 797                         pointer operator->() const
 798                         {
 799                                 return operator*();
 800                         }
 801
 802                         //! Accesses the full character at the iterator's position.
 803                         pointer operator->()
 804                         {
 805                                 return operator*();
 806                         }
 807         };
 808
        //! Short public name for the _ustring16_iterator class.
        typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
        //! Short public name for the _ustring16_const_iterator class.
        typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
 811
 812         ///----------------------///
 813         /// end iterator classes ///
 814         ///----------------------///
 815
 816         //! Default constructor
 817         ustring16()
 818         : array(0), allocated(1), used(0)
 819         {
 820 #if __BYTE_ORDER == __BIG_ENDIAN
 821                 encoding = unicode::EUTFE_UTF16_BE;
 822 #else
 823                 encoding = unicode::EUTFE_UTF16_LE;
 824 #endif
 825                 array = allocator.allocate(1); // new u16[1];
 826                 array[0] = 0x0;
 827         }
 828
 829
 830         //! Constructor
 831         ustring16(const ustring16<TAlloc>& other)
 832         : array(0), allocated(0), used(0)
 833         {
 834 #if __BYTE_ORDER == __BIG_ENDIAN
 835                 encoding = unicode::EUTFE_UTF16_BE;
 836 #else
 837                 encoding = unicode::EUTFE_UTF16_LE;
 838 #endif
 839                 *this = other;
 840         }
 841
 842
 843         //! Constructor from other string types
 844         template <class B, class A>
 845         ustring16(const string<B, A>& other)
 846         : array(0), allocated(0), used(0)
 847         {
 848 #if __BYTE_ORDER == __BIG_ENDIAN
 849                 encoding = unicode::EUTFE_UTF16_BE;
 850 #else
 851                 encoding = unicode::EUTFE_UTF16_LE;
 852 #endif
 853                 *this = other;
 854         }
 855
 856
 857 #ifndef USTRING_NO_STL
 858         //! Constructor from std::string
 859         template <class B, class A, typename Alloc>
 860         ustring16(const std::basic_string<B, A, Alloc>& other)
 861         : array(0), allocated(0), used(0)
 862         {
 863 #if __BYTE_ORDER == __BIG_ENDIAN
 864                 encoding = unicode::EUTFE_UTF16_BE;
 865 #else
 866                 encoding = unicode::EUTFE_UTF16_LE;
 867 #endif
 868                 *this = other.c_str();
 869         }
 870
 871
 872         //! Constructor from iterator.
 873         template <typename Itr>
 874         ustring16(Itr first, Itr last)
 875         : array(0), allocated(0), used(0)
 876         {
 877 #if __BYTE_ORDER == __BIG_ENDIAN
 878                 encoding = unicode::EUTFE_UTF16_BE;
 879 #else
 880                 encoding = unicode::EUTFE_UTF16_LE;
 881 #endif
 882                 reserve(std::distance(first, last));
 883                 array[used] = 0;
 884
 885                 for (; first != last; ++first)
 886                         append((uchar32_t)*first);
 887         }
 888 #endif
 889
 890
 891 #ifndef USTRING_CPP0X_NEWLITERALS
 892         //! Constructor for copying a character string from a pointer.
 893         ustring16(const char* const c)
 894         : array(0), allocated(0), used(0)
 895         {
 896 #if __BYTE_ORDER == __BIG_ENDIAN
 897                 encoding = unicode::EUTFE_UTF16_BE;
 898 #else
 899                 encoding = unicode::EUTFE_UTF16_LE;
 900 #endif
 901
 902                 loadDataStream(c, strlen(c));
 903                 //append((uchar8_t*)c);
 904         }
 905
 906
 907         //! Constructor for copying a character string from a pointer with a given length.
 908         ustring16(const char* const c, u32 length)
 909         : array(0), allocated(0), used(0)
 910         {
 911 #if __BYTE_ORDER == __BIG_ENDIAN
 912                 encoding = unicode::EUTFE_UTF16_BE;
 913 #else
 914                 encoding = unicode::EUTFE_UTF16_LE;
 915 #endif
 916
 917                 loadDataStream(c, length);
 918         }
 919 #endif
 920
 921
 922         //! Constructor for copying a UTF-8 string from a pointer.
 923         ustring16(const uchar8_t* const c)
 924         : array(0), allocated(0), used(0)
 925         {
 926 #if __BYTE_ORDER == __BIG_ENDIAN
 927                 encoding = unicode::EUTFE_UTF16_BE;
 928 #else
 929                 encoding = unicode::EUTFE_UTF16_LE;
 930 #endif
 931
 932                 append(c);
 933         }
 934
 935
 936         //! Constructor for copying a UTF-8 string from a single char.
 937         ustring16(const char c)
 938         : array(0), allocated(0), used(0)
 939         {
 940 #if __BYTE_ORDER == __BIG_ENDIAN
 941                 encoding = unicode::EUTFE_UTF16_BE;
 942 #else
 943                 encoding = unicode::EUTFE_UTF16_LE;
 944 #endif
 945
 946                 append((uchar32_t)c);
 947         }
 948
 949
 950         //! Constructor for copying a UTF-8 string from a pointer with a given length.
 951         ustring16(const uchar8_t* const c, u32 length)
 952         : array(0), allocated(0), used(0)
 953         {
 954 #if __BYTE_ORDER == __BIG_ENDIAN
 955                 encoding = unicode::EUTFE_UTF16_BE;
 956 #else
 957                 encoding = unicode::EUTFE_UTF16_LE;
 958 #endif
 959
 960                 append(c, length);
 961         }
 962
 963
 964         //! Constructor for copying a UTF-16 string from a pointer.
 965         ustring16(const uchar16_t* const c)
 966         : array(0), allocated(0), used(0)
 967         {
 968 #if __BYTE_ORDER == __BIG_ENDIAN
 969                 encoding = unicode::EUTFE_UTF16_BE;
 970 #else
 971                 encoding = unicode::EUTFE_UTF16_LE;
 972 #endif
 973
 974                 append(c);
 975         }
 976
 977
 978         //! Constructor for copying a UTF-16 string from a pointer with a given length
 979         ustring16(const uchar16_t* const c, u32 length)
 980         : array(0), allocated(0), used(0)
 981         {
 982 #if __BYTE_ORDER == __BIG_ENDIAN
 983                 encoding = unicode::EUTFE_UTF16_BE;
 984 #else
 985                 encoding = unicode::EUTFE_UTF16_LE;
 986 #endif
 987
 988                 append(c, length);
 989         }
 990
 991
 992         //! Constructor for copying a UTF-32 string from a pointer.
 993         ustring16(const uchar32_t* const c)
 994         : array(0), allocated(0), used(0)
 995         {
 996 #if __BYTE_ORDER == __BIG_ENDIAN
 997                 encoding = unicode::EUTFE_UTF16_BE;
 998 #else
 999                 encoding = unicode::EUTFE_UTF16_LE;
1000 #endif
1001
1002                 append(c);
1003         }
1004
1005
1006         //! Constructor for copying a UTF-32 from a pointer with a given length.
1007         ustring16(const uchar32_t* const c, u32 length)
1008         : array(0), allocated(0), used(0)
1009         {
1010 #if __BYTE_ORDER == __BIG_ENDIAN
1011                 encoding = unicode::EUTFE_UTF16_BE;
1012 #else
1013                 encoding = unicode::EUTFE_UTF16_LE;
1014 #endif
1015
1016                 append(c, length);
1017         }
1018
1019
1020         //! Constructor for copying a wchar_t string from a pointer.
1021         ustring16(const wchar_t* const c)
1022         : array(0), allocated(0), used(0)
1023         {
1024 #if __BYTE_ORDER == __BIG_ENDIAN
1025                 encoding = unicode::EUTFE_UTF16_BE;
1026 #else
1027                 encoding = unicode::EUTFE_UTF16_LE;
1028 #endif
1029
1030                 if (sizeof(wchar_t) == 4)
1031                         append(reinterpret_cast<const uchar32_t* const>(c));
1032                 else if (sizeof(wchar_t) == 2)
1033                         append(reinterpret_cast<const uchar16_t* const>(c));
1034                 else if (sizeof(wchar_t) == 1)
1035                         append(reinterpret_cast<const uchar8_t* const>(c));
1036         }
1037
1038
1039         //! Constructor for copying a wchar_t string from a pointer with a given length.
1040         ustring16(const wchar_t* const c, u32 length)
1041         : array(0), allocated(0), used(0)
1042         {
1043 #if __BYTE_ORDER == __BIG_ENDIAN
1044                 encoding = unicode::EUTFE_UTF16_BE;
1045 #else
1046                 encoding = unicode::EUTFE_UTF16_LE;
1047 #endif
1048
1049                 if (sizeof(wchar_t) == 4)
1050                         append(reinterpret_cast<const uchar32_t* const>(c), length);
1051                 else if (sizeof(wchar_t) == 2)
1052                         append(reinterpret_cast<const uchar16_t* const>(c), length);
1053                 else if (sizeof(wchar_t) == 1)
1054                         append(reinterpret_cast<const uchar8_t* const>(c), length);
1055         }
1056
1057
1058 #ifdef USTRING_CPP0X
1059         //! Constructor for moving a ustring16
1060         ustring16(ustring16<TAlloc>&& other)
1061         : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
1062         {
1063                 //std::cout << "MOVE constructor" << std::endl;
1064                 other.array = 0;
1065                 other.allocated = 0;
1066                 other.used = 0;
1067         }
1068 #endif
1069
1070
1071         //! Destructor
1072         ~ustring16()
1073         {
1074                 allocator.deallocate(array); // delete [] array;
1075         }
1076
1077
1078         //! Assignment operator
1079         ustring16& operator=(const ustring16<TAlloc>& other)
1080         {
1081                 if (this == &other)
1082                         return *this;
1083
1084                 used = other.size_raw();
1085                 if (used >= allocated)
1086                 {
1087                         allocator.deallocate(array); // delete [] array;
1088                         allocated = used + 1;
1089                         array = allocator.allocate(used + 1); //new u16[used];
1090                 }
1091
1092                 const uchar16_t* p = other.c_str();
1093                 for (u32 i=0; i<=used; ++i, ++p)
1094                         array[i] = *p;
1095
1096                 array[used] = 0;
1097
1098                 // Validate our new UTF-16 string.
1099                 validate();
1100
1101                 return *this;
1102         }
1103
1104
1105 #ifdef USTRING_CPP0X
1106         //! Move assignment operator
1107         ustring16& operator=(ustring16<TAlloc>&& other)
1108         {
1109                 if (this != &other)
1110                 {
1111                         //std::cout << "MOVE operator=" << std::endl;
1112                         allocator.deallocate(array);
1113
1114                         array = other.array;
1115                         allocated = other.allocated;
1116                         encoding = other.encoding;
1117                         used = other.used;
1118                         other.array = 0;
1119                         other.used = 0;
1120                 }
1121                 return *this;
1122         }
1123 #endif
1124
1125
1126         //! Assignment operator for other string types
1127         template <class B, class A>
1128         ustring16<TAlloc>& operator=(const string<B, A>& other)
1129         {
1130                 *this = other.c_str();
1131                 return *this;
1132         }
1133
1134
1135         //! Assignment operator for UTF-8 strings
1136         ustring16<TAlloc>& operator=(const uchar8_t* const c)
1137         {
1138                 if (!array)
1139                 {
1140                         array = allocator.allocate(1); //new u16[1];
1141                         allocated = 1;
1142                 }
1143                 used = 0;
1144                 array[used] = 0x0;
1145                 if (!c) return *this;
1146
1147                 //! Append our string now.
1148                 append(c);
1149                 return *this;
1150         }
1151
1152
1153         //! Assignment operator for UTF-16 strings
1154         ustring16<TAlloc>& operator=(const uchar16_t* const c)
1155         {
1156                 if (!array)
1157                 {
1158                         array = allocator.allocate(1); //new u16[1];
1159                         allocated = 1;
1160                 }
1161                 used = 0;
1162                 array[used] = 0x0;
1163                 if (!c) return *this;
1164
1165                 //! Append our string now.
1166                 append(c);
1167                 return *this;
1168         }
1169
1170
1171         //! Assignment operator for UTF-32 strings
1172         ustring16<TAlloc>& operator=(const uchar32_t* const c)
1173         {
1174                 if (!array)
1175                 {
1176                         array = allocator.allocate(1); //new u16[1];
1177                         allocated = 1;
1178                 }
1179                 used = 0;
1180                 array[used] = 0x0;
1181                 if (!c) return *this;
1182
1183                 //! Append our string now.
1184                 append(c);
1185                 return *this;
1186         }
1187
1188
1189         //! Assignment operator for wchar_t strings.
1190         /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1191                 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1192                 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
1193         ustring16<TAlloc>& operator=(const wchar_t* const c)
1194         {
1195                 if (sizeof(wchar_t) == 4)
1196                         *this = reinterpret_cast<const uchar32_t* const>(c);
1197                 else if (sizeof(wchar_t) == 2)
1198                         *this = reinterpret_cast<const uchar16_t* const>(c);
1199                 else if (sizeof(wchar_t) == 1)
1200                         *this = reinterpret_cast<const uchar8_t* const>(c);
1201
1202                 return *this;
1203         }
1204
1205
1206         //! Assignment operator for other strings.
1207         /** Note that this assumes that a correct unicode string is stored in the string. **/
1208         template <class B>
1209         ustring16<TAlloc>& operator=(const B* const c)
1210         {
1211                 if (sizeof(B) == 4)
1212                         *this = reinterpret_cast<const uchar32_t* const>(c);
1213                 else if (sizeof(B) == 2)
1214                         *this = reinterpret_cast<const uchar16_t* const>(c);
1215                 else if (sizeof(B) == 1)
1216                         *this = reinterpret_cast<const uchar8_t* const>(c);
1217
1218                 return *this;
1219         }
1220
1221
1222         //! Direct access operator
1223         access operator [](const u32 index)
1224         {
1225                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1226                 iterator iter(*this, index);
1227                 return iter.operator*();
1228         }
1229
1230
1231         //! Direct access operator
1232         const access operator [](const u32 index) const
1233         {
1234                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1235                 const_iterator iter(*this, index);
1236                 return iter.operator*();
1237         }
1238
1239
1240         //! Equality operator
1241         bool operator ==(const uchar16_t* const str) const
1242         {
1243                 if (!str)
1244                         return false;
1245
1246                 u32 i;
1247                 for(i=0; array[i] && str[i]; ++i)
1248                         if (array[i] != str[i])
1249                                 return false;
1250
1251                 return !array[i] && !str[i];
1252         }
1253
1254
1255         //! Equality operator
1256         bool operator ==(const ustring16<TAlloc>& other) const
1257         {
1258                 for(u32 i=0; array[i] && other.array[i]; ++i)
1259                         if (array[i] != other.array[i])
1260                                 return false;
1261
1262                 return used == other.used;
1263         }
1264
1265
1266         //! Is smaller comparator
1267         bool operator <(const ustring16<TAlloc>& other) const
1268         {
1269                 for(u32 i=0; array[i] && other.array[i]; ++i)
1270                 {
1271                         s32 diff = array[i] - other.array[i];
1272                         if ( diff )
1273                                 return diff < 0;
1274                 }
1275
1276                 return used < other.used;
1277         }
1278
1279
1280         //! Inequality operator
1281         bool operator !=(const uchar16_t* const str) const
1282         {
1283                 return !(*this == str);
1284         }
1285
1286
1287         //! Inequality operator
1288         bool operator !=(const ustring16<TAlloc>& other) const
1289         {
1290                 return !(*this == other);
1291         }
1292
1293
1294         //! Returns the length of a ustring16 in full characters.
1295         //! \return Length of a ustring16 in full characters.
1296         u32 size() const
1297         {
1298                 const_iterator i(*this, 0);
1299                 u32 pos = 0;
1300                 while (!i.atEnd())
1301                 {
1302                         ++i;
1303                         ++pos;
1304                 }
1305                 return pos;
1306         }
1307
1308
1309         //! Informs if the ustring is empty or not.
1310         //! \return True if the ustring is empty, false if not.
1311         bool empty() const
1312         {
1313                 return (size_raw() == 0);
1314         }
1315
1316
1317         //! Returns a pointer to the raw UTF-16 string data.
1318         //! \return pointer to C-style NUL terminated array of UTF-16 code points.
1319         const uchar16_t* c_str() const
1320         {
1321                 return array;
1322         }
1323
1324
1325         //! Compares the first n characters of this string with another.
1326         //! \param other Other string to compare to.
1327         //! \param n Number of characters to compare.
1328         //! \return True if the n first characters of both strings are equal.
1329         bool equalsn(const ustring16<TAlloc>& other, u32 n) const
1330         {
1331                 u32 i;
1332                 const uchar16_t* oa = other.c_str();
1333                 for(i=0; array[i] && oa[i] && i < n; ++i)
1334                         if (array[i] != oa[i])
1335                                 return false;
1336
1337                 // if one (or both) of the strings was smaller then they
1338                 // are only equal if they have the same length
1339                 return (i == n) || (used == other.used);
1340         }
1341
1342
1343         //! Compares the first n characters of this string with another.
1344         //! \param str Other string to compare to.
1345         //! \param n Number of characters to compare.
1346         //! \return True if the n first characters of both strings are equal.
1347         bool equalsn(const uchar16_t* const str, u32 n) const
1348         {
1349                 if (!str)
1350                         return false;
1351                 u32 i;
1352                 for(i=0; array[i] && str[i] && i < n; ++i)
1353                         if (array[i] != str[i])
1354                                 return false;
1355
1356                 // if one (or both) of the strings was smaller then they
1357                 // are only equal if they have the same length
1358                 return (i == n) || (array[i] == 0 && str[i] == 0);
1359         }
1360
1361
1362         //! Appends a character to this ustring16
1363         //! \param character The character to append.
1364         //! \return A reference to our current string.
1365         ustring16<TAlloc>& append(uchar32_t character)
1366         {
1367                 if (used + 2 >= allocated)
1368                         reallocate(used + 2);
1369
1370                 if (character > 0xFFFF)
1371                 {
1372                         used += 2;
1373
1374                         // character will be multibyte, so split it up into a surrogate pair.
1375                         uchar16_t x = static_cast<uchar16_t>(character);
1376                         uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1377                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1378                         array[used-2] = vh;
1379                         array[used-1] = vl;
1380                 }
1381                 else
1382                 {
1383                         ++used;
1384                         array[used-1] = character;
1385                 }
1386                 array[used] = 0;
1387
1388                 return *this;
1389         }
1390
1391
1392         //! Appends a UTF-8 string to this ustring16
1393         //! \param other The UTF-8 string to append.
1394         //! \param length The length of the string to append.
1395         //! \return A reference to our current string.
1396         ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
1397         {
1398                 if (!other)
1399                         return *this;
1400
1401                 // Determine if the string is long enough for a BOM.
1402                 u32 len = 0;
1403                 const uchar8_t* p = other;
1404                 do
1405                 {
1406                         ++len;
1407                 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1408
1409                 // Check for BOM.
1410                 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1411                 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1412                 {
1413                         if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1414                                 c_bom = unicode::EUTFE_UTF8;
1415                 }
1416
1417                 // If a BOM was found, don't include it in the string.
1418                 const uchar8_t* c2 = other;
1419                 if (c_bom != unicode::EUTFE_NONE)
1420                 {
1421                         c2 = other + unicode::BOM_UTF8_LEN;
1422                         length -= unicode::BOM_UTF8_LEN;
1423                 }
1424
1425                 // Calculate the size of the string to read in.
1426                 len = 0;
1427                 p = c2;
1428                 do
1429                 {
1430                         ++len;
1431                 } while(*p++ && len < length);
1432                 if (len > length)
1433                         len = length;
1434
1435                 // If we need to grow the array, do it now.
1436                 if (used + len >= allocated)
1437                         reallocate(used + (len * 2));
1438                 u32 start = used;
1439
1440                 // Convert UTF-8 to UTF-16.
1441                 u32 pos = start;
1442                 for (u32 l = 0; l<len;)
1443                 {
1444                         ++used;
1445                         if (((c2[l] >> 6) & 0x03) == 0x02)
1446                         {       // Invalid continuation byte.
1447                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1448                                 ++l;
1449                         }
1450                         else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1451                         {       // Invalid byte - overlong encoding.
1452                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1453                                 ++l;
1454                         }
1455                         else if ((c2[l] & 0xF8) == 0xF0)
1456                         {       // 4 bytes UTF-8, 2 bytes UTF-16.
1457                                 // Check for a full string.
1458                                 if ((l + 3) >= len)
1459                                 {
1460                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1461                                         l += 3;
1462                                         break;
1463                                 }
1464
1465                                 // Validate.
1466                                 bool valid = true;
1467                                 u8 l2 = 0;
1468                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1469                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1470                                 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1471                                 if (!valid)
1472                                 {
1473                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1474                                         l += l2;
1475                                         continue;
1476                                 }
1477
1478                                 // Decode.
1479                                 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1480                                 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1481                                 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1482                                 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1483
1484                                 // Split v up into a surrogate pair.
1485                                 uchar16_t x = static_cast<uchar16_t>(v);
1486                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1487                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1488
1489                                 array[pos++] = vh;
1490                                 array[pos++] = vl;
1491                                 l += 4;
1492                                 ++used;         // Using two shorts this time, so increase used by 1.
1493                         }
1494                         else if ((c2[l] & 0xF0) == 0xE0)
1495                         {       // 3 bytes UTF-8, 1 byte UTF-16.
1496                                 // Check for a full string.
1497                                 if ((l + 2) >= len)
1498                                 {
1499                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1500                                         l += 2;
1501                                         break;
1502                                 }
1503
1504                                 // Validate.
1505                                 bool valid = true;
1506                                 u8 l2 = 0;
1507                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1508                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1509                                 if (!valid)
1510                                 {
1511                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1512                                         l += l2;
1513                                         continue;
1514                                 }
1515
1516                                 // Decode.
1517                                 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1518                                 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1519                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1520                                 array[pos++] = ch;
1521                                 l += 3;
1522                         }
1523                         else if ((c2[l] & 0xE0) == 0xC0)
1524                         {       // 2 bytes UTF-8, 1 byte UTF-16.
1525                                 // Check for a full string.
1526                                 if ((l + 1) >= len)
1527                                 {
1528                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1529                                         l += 1;
1530                                         break;
1531                                 }
1532
1533                                 // Validate.
1534                                 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1535                                 {
1536                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1537                                         ++l;
1538                                         continue;
1539                                 }
1540
1541                                 // Decode.
1542                                 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1543                                 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1544                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1545                                 array[pos++] = ch;
1546                                 l += 2;
1547                         }
1548                         else
1549                         {       // 1 byte UTF-8, 1 byte UTF-16.
1550                                 // Validate.
1551                                 if (c2[l] > 0x7F)
1552                                 {       // Values above 0xF4 are restricted and aren't used.  By now, anything above 0x7F is invalid.
1553                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1554                                 }
1555                                 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1556                                 ++l;
1557                         }
1558                 }
1559                 array[used] = 0;
1560
1561                 // Validate our new UTF-16 string.
1562                 validate();
1563
1564                 return *this;
1565         }
1566
1567
1568         //! Appends a UTF-16 string to this ustring16
1569         //! \param other The UTF-16 string to append.
1570         //! \param length The length of the string to append.
1571         //! \return A reference to our current string.
1572         ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
1573         {
1574                 if (!other)
1575                         return *this;
1576
1577                 // Determine if the string is long enough for a BOM.
1578                 u32 len = 0;
1579                 const uchar16_t* p = other;
1580                 do
1581                 {
1582                         ++len;
1583                 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1584
1585                 // Check for the BOM to determine the string's endianness.
1586                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1587                 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1588                         c_end = unicode::EUTFEE_LITTLE;
1589                 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1590                         c_end = unicode::EUTFEE_BIG;
1591
1592                 // If a BOM was found, don't include it in the string.
1593                 const uchar16_t* c2 = other;
1594                 if (c_end != unicode::EUTFEE_NATIVE)
1595                 {
1596                         c2 = other + unicode::BOM_UTF16_LEN;
1597                         length -= unicode::BOM_UTF16_LEN;
1598                 }
1599
1600                 // Calculate the size of the string to read in.
1601                 len = 0;
1602                 p = c2;
1603                 do
1604                 {
1605                         ++len;
1606                 } while(*p++ && len < length);
1607                 if (len > length)
1608                         len = length;
1609
1610                 // If we need to grow the size of the array, do it now.
1611                 if (used + len >= allocated)
1612                         reallocate(used + (len * 2));
1613                 u32 start = used;
1614                 used += len;
1615
1616                 // Copy the string now.
1617                 unicode::EUTF_ENDIAN m_end = getEndianness();
1618                 for (u32 l = start; l < start + len; ++l)
1619                 {
1620                         array[l] = (uchar16_t)c2[l];
1621                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1622                                 array[l] = unicode::swapEndian16(array[l]);
1623                 }
1624
1625                 array[used] = 0;
1626
1627                 // Validate our new UTF-16 string.
1628                 validate();
1629                 return *this;
1630         }
1631
1632
1633         //! Appends a UTF-32 string to this ustring16
1634         //! \param other The UTF-32 string to append.
1635         //! \param length The length of the string to append.
1636         //! \return A reference to our current string.
1637         ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
1638         {
1639                 if (!other)
1640                         return *this;
1641
1642                 // Check for the BOM to determine the string's endianness.
1643                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1644                 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1645                         c_end = unicode::EUTFEE_LITTLE;
1646                 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1647                         c_end = unicode::EUTFEE_BIG;
1648
1649                 // If a BOM was found, don't include it in the string.
1650                 const uchar32_t* c2 = other;
1651                 if (c_end != unicode::EUTFEE_NATIVE)
1652                 {
1653                         c2 = other + unicode::BOM_UTF32_LEN;
1654                         length -= unicode::BOM_UTF32_LEN;
1655                 }
1656
1657                 // Calculate the size of the string to read in.
1658                 u32 len = 0;
1659                 const uchar32_t* p = c2;
1660                 do
1661                 {
1662                         ++len;
1663                 } while(*p++ && len < length);
1664                 if (len > length)
1665                         len = length;
1666
1667                 // If we need to grow the size of the array, do it now.
1668                 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1669                 if (used + (len * 2) >= allocated)
1670                         reallocate(used + ((len * 2) * 2));
1671                 u32 start = used;
1672
1673                 // Convert UTF-32 to UTF-16.
1674                 unicode::EUTF_ENDIAN m_end = getEndianness();
1675                 u32 pos = start;
1676                 for (u32 l = 0; l<len; ++l)
1677                 {
1678                         ++used;
1679
1680                         uchar32_t ch = c2[l];
1681                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1682                                 ch = unicode::swapEndian32(ch);
1683
1684                         if (ch > 0xFFFF)
1685                         {
1686                                 // Split ch up into a surrogate pair as it is over 16 bits long.
1687                                 uchar16_t x = static_cast<uchar16_t>(ch);
1688                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1689                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1690                                 array[pos++] = vh;
1691                                 array[pos++] = vl;
1692                                 ++used;         // Using two shorts, so increased used again.
1693                         }
1694                         else if (ch >= 0xD800 && ch <= 0xDFFF)
1695                         {
1696                                 // Between possible UTF-16 surrogates (invalid!)
1697                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1698                         }
1699                         else array[pos++] = static_cast<uchar16_t>(ch);
1700                 }
1701                 array[used] = 0;
1702
1703                 // Validate our new UTF-16 string.
1704                 validate();
1705
1706                 return *this;
1707         }
1708
1709
1710         //! Appends a ustring16 to this ustring16
1711         //! \param other The string to append to this one.
1712         //! \return A reference to our current string.
1713         ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
1714         {
1715                 const uchar16_t* oa = other.c_str();
1716
1717                 u32 len = other.size_raw();
1718
1719                 if (used + len >= allocated)
1720                         reallocate(used + len);
1721
1722                 for (u32 l=0; l<len; ++l)
1723                         array[used+l] = oa[l];
1724
1725                 used += len;
1726                 array[used] = 0;
1727
1728                 return *this;
1729         }
1730
1731
1732         //! Appends a certain amount of characters of a ustring16 to this ustring16.
1733         //! \param other The string to append to this one.
1734         //! \param length How many characters of the other string to add to this one.
1735         //! \return A reference to our current string.
1736         ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
1737         {
1738                 if (other.size() == 0)
1739                         return *this;
1740
1741                 if (other.size() < length)
1742                 {
1743                         append(other);
1744                         return *this;
1745                 }
1746
1747                 if (used + length * 2 >= allocated)
1748                         reallocate(used + length * 2);
1749
1750                 const_iterator iter(other, 0);
1751                 u32 l = length;
1752                 while (!iter.atEnd() && l)
1753                 {
1754                         uchar32_t c = *iter;
1755                         append(c);
1756                         ++iter;
1757                         --l;
1758                 }
1759
1760                 return *this;
1761         }
1762
1763
1764         //! Reserves some memory.
1765         //! \param count The amount of characters to reserve.
1766         void reserve(u32 count)
1767         {
1768                 if (count < allocated)
1769                         return;
1770
1771                 reallocate(count);
1772         }
1773
1774
1775         //! Finds first occurrence of character.
1776         //! \param c The character to search for.
1777         //! \return Position where the character has been found, or -1 if not found.
1778         s32 findFirst(uchar32_t c) const
1779         {
1780                 const_iterator i(*this, 0);
1781
1782                 s32 pos = 0;
1783                 while (!i.atEnd())
1784                 {
1785                         uchar32_t t = *i;
1786                         if (c == t)
1787                                 return pos;
1788                         ++pos;
1789                         ++i;
1790                 }
1791
1792                 return -1;
1793         }
1794
1795         //! Finds first occurrence of a character of a list.
1796         //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1797         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1798         //! \return Position where one of the characters has been found, or -1 if not found.
1799         s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1800         {
1801                 if (!c || !count)
1802                         return -1;
1803
1804                 const_iterator i(*this, 0);
1805
1806                 s32 pos = 0;
1807                 while (!i.atEnd())
1808                 {
1809                         uchar32_t t = *i;
1810                         for (u32 j=0; j<count; ++j)
1811                                 if (t == c[j])
1812                                         return pos;
1813                         ++pos;
1814                         ++i;
1815                 }
1816
1817                 return -1;
1818         }
1819
1820
1821         //! Finds first position of a character not in a given list.
1822         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1823         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1824         //! \return Position where the character has been found, or -1 if not found.
1825         s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1826         {
1827                 if (!c || !count)
1828                         return -1;
1829
1830                 const_iterator i(*this, 0);
1831
1832                 s32 pos = 0;
1833                 while (!i.atEnd())
1834                 {
1835                         uchar32_t t = *i;
1836                         u32 j;
1837                         for (j=0; j<count; ++j)
1838                                 if (t == c[j])
1839                                         break;
1840
1841                         if (j==count)
1842                                 return pos;
1843                         ++pos;
1844                         ++i;
1845                 }
1846
1847                 return -1;
1848         }
1849
1850         //! Finds last position of a character not in a given list.
1851         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1852         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1853         //! \return Position where the character has been found, or -1 if not found.
1854         s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1855         {
1856                 if (!c || !count)
1857                         return -1;
1858
1859                 const_iterator i(end());
1860                 --i;
1861
1862                 s32 pos = size() - 1;
1863                 while (!i.atStart())
1864                 {
1865                         uchar32_t t = *i;
1866                         u32 j;
1867                         for (j=0; j<count; ++j)
1868                                 if (t == c[j])
1869                                         break;
1870
1871                         if (j==count)
1872                                 return pos;
1873                         --pos;
1874                         --i;
1875                 }
1876
1877                 return -1;
1878         }
1879
1880         //! Finds next occurrence of character.
1881         //! \param c The character to search for.
1882         //! \param startPos The position in the string to start searching.
1883         //! \return Position where the character has been found, or -1 if not found.
1884         s32 findNext(uchar32_t c, u32 startPos) const
1885         {
1886                 const_iterator i(*this, startPos);
1887
1888                 s32 pos = startPos;
1889                 while (!i.atEnd())
1890                 {
1891                         uchar32_t t = *i;
1892                         if (t == c)
1893                                 return pos;
1894                         ++pos;
1895                         ++i;
1896                 }
1897
1898                 return -1;
1899         }
1900
1901
1902         //! Finds last occurrence of character.
1903         //! \param c The character to search for.
1904         //! \param start The start position of the reverse search ( default = -1, on end ).
1905         //! \return Position where the character has been found, or -1 if not found.
1906         s32 findLast(uchar32_t c, s32 start = -1) const
1907         {
1908                 u32 s = size();
1909                 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1910
1911                 const_iterator i(*this, start);
1912                 u32 pos = start;
1913                 while (!i.atStart())
1914                 {
1915                         uchar32_t t = *i;
1916                         if (t == c)
1917                                 return pos;
1918                         --pos;
1919                         --i;
1920                 }
1921
1922                 return -1;
1923         }
1924
1925         //! Finds last occurrence of a character in a list.
1926         //! \param c A list of strings to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1927         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1928         //! \return Position where one of the characters has been found, or -1 if not found.
1929         s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1930         {
1931                 if (!c || !count)
1932                         return -1;
1933
1934                 const_iterator i(end());
1935                 --i;
1936
1937                 s32 pos = size();
1938                 while (!i.atStart())
1939                 {
1940                         uchar32_t t = *i;
1941                         for (u32 j=0; j<count; ++j)
1942                                 if (t == c[j])
1943                                         return pos;
1944                         --pos;
1945                         --i;
1946                 }
1947
1948                 return -1;
1949         }
1950
1951
1952         //! Finds another ustring16 in this ustring16.
1953         //! \param str The string to find.
1954         //! \param start The start position of the search.
1955         //! \return Positions where the ustring16 has been found, or -1 if not found.
1956         s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
1957         {
1958                 u32 my_size = size();
1959                 u32 their_size = str.size();
1960
1961                 if (their_size == 0 || my_size - start < their_size)
1962                         return -1;
1963
1964                 const_iterator i(*this, start);
1965
1966                 s32 pos = start;
1967                 while (!i.atEnd())
1968                 {
1969                         const_iterator i2(i);
1970                         const_iterator j(str, 0);
1971                         uchar32_t t1 = (uchar32_t)*i2;
1972                         uchar32_t t2 = (uchar32_t)*j;
1973                         while (t1 == t2)
1974                         {
1975                                 ++i2;
1976                                 ++j;
1977                                 if (j.atEnd())
1978                                         return pos;
1979                                 t1 = (uchar32_t)*i2;
1980                                 t2 = (uchar32_t)*j;
1981                         }
1982                         ++i;
1983                         ++pos;
1984                 }
1985
1986                 return -1;
1987         }
1988
1989
1990         //! Finds another ustring16 in this ustring16.
1991         //! \param str The string to find.
1992         //! \param start The start position of the search.
1993         //! \return Positions where the string has been found, or -1 if not found.
1994         s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
1995         {
1996                 const uchar16_t* data = str.c_str();
1997                 if (data && *data)
1998                 {
1999                         u32 len = 0;
2000
2001                         while (data[len])
2002                                 ++len;
2003
2004                         if (len > used)
2005                                 return -1;
2006
2007                         for (u32 i=start; i<=used-len; ++i)
2008                         {
2009                                 u32 j=0;
2010
2011                                 while(data[j] && array[i+j] == data[j])
2012                                         ++j;
2013
2014                                 if (!data[j])
2015                                         return i;
2016                         }
2017                 }
2018
2019                 return -1;
2020         }
2021
2022
2023         //! Returns a substring.
2024         //! \param begin: Start of substring.
2025         //! \param length: Length of substring.
2026         //! \return A reference to our current string.
2027         ustring16<TAlloc> subString(u32 begin, s32 length) const
2028         {
2029                 u32 len = size();
2030                 // if start after ustring16
2031                 // or no proper substring length
2032                 if ((length <= 0) || (begin>=len))
2033                         return ustring16<TAlloc>("");
2034                 // clamp length to maximal value
2035                 if ((length+begin) > len)
2036                         length = len-begin;
2037
2038                 ustring16<TAlloc> o;
2039                 o.reserve((length+1) * 2);
2040
2041                 const_iterator i(*this, begin);
2042                 while (!i.atEnd() && length)
2043                 {
2044                         o.append(*i);
2045                         ++i;
2046                         --length;
2047                 }
2048
2049                 return o;
2050         }
2051
2052
2053         //! Appends a character to this ustring16.
2054         //! \param c Character to append.
2055         //! \return A reference to our current string.
2056         ustring16<TAlloc>& operator += (char c)
2057         {
2058                 append((uchar32_t)c);
2059                 return *this;
2060         }
2061
2062
2063         //! Appends a character to this ustring16.
2064         //! \param c Character to append.
2065         //! \return A reference to our current string.
2066         ustring16<TAlloc>& operator += (uchar32_t c)
2067         {
2068                 append(c);
2069                 return *this;
2070         }
2071
2072
2073         //! Appends a number to this ustring16.
2074         //! \param c Number to append.
2075         //! \return A reference to our current string.
2076         ustring16<TAlloc>& operator += (short c)
2077         {
2078                 append(core::stringc(c));
2079                 return *this;
2080         }
2081
2082
2083         //! Appends a number to this ustring16.
2084         //! \param c Number to append.
2085         //! \return A reference to our current string.
2086         ustring16<TAlloc>& operator += (unsigned short c)
2087         {
2088                 append(core::stringc(c));
2089                 return *this;
2090         }
2091
2092
2093 #ifdef USTRING_CPP0X_NEWLITERALS
2094         //! Appends a number to this ustring16.
2095         //! \param c Number to append.
2096         //! \return A reference to our current string.
2097         ustring16<TAlloc>& operator += (int c)
2098         {
2099                 append(core::stringc(c));
2100                 return *this;
2101         }
2102
2103
2104         //! Appends a number to this ustring16.
2105         //! \param c Number to append.
2106         //! \return A reference to our current string.
2107         ustring16<TAlloc>& operator += (unsigned int c)
2108         {
2109                 append(core::stringc(c));
2110                 return *this;
2111         }
2112 #endif
2113
2114
2115         //! Appends a number to this ustring16.
2116         //! \param c Number to append.
2117         //! \return A reference to our current string.
2118         ustring16<TAlloc>& operator += (long c)
2119         {
2120                 append(core::stringc(c));
2121                 return *this;
2122         }
2123
2124
2125         //! Appends a number to this ustring16.
2126         //! \param c Number to append.
2127         //! \return A reference to our current string.
2128         ustring16<TAlloc>& operator += (unsigned long c)
2129         {
2130                 append(core::stringc(c));
2131                 return *this;
2132         }
2133
2134
2135         //! Appends a number to this ustring16.
2136         //! \param c Number to append.
2137         //! \return A reference to our current string.
2138         ustring16<TAlloc>& operator += (double c)
2139         {
2140                 append(core::stringc(c));
2141                 return *this;
2142         }
2143
2144
2145         //! Appends a char ustring16 to this ustring16.
2146         //! \param c Char ustring16 to append.
2147         //! \return A reference to our current string.
2148         ustring16<TAlloc>& operator += (const uchar16_t* const c)
2149         {
2150                 append(c);
2151                 return *this;
2152         }
2153
2154
2155         //! Appends a ustring16 to this ustring16.
2156         //! \param other ustring16 to append.
2157         //! \return A reference to our current string.
2158         ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
2159         {
2160                 append(other);
2161                 return *this;
2162         }
2163
2164
2165         //! Replaces all characters of a given type with another one.
2166         //! \param toReplace Character to replace.
2167         //! \param replaceWith Character replacing the old one.
2168         //! \return A reference to our current string.
2169         ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
2170         {
2171                 iterator i(*this, 0);
2172                 while (!i.atEnd())
2173                 {
2174                         typename ustring16<TAlloc>::access a = *i;
2175                         if ((uchar32_t)a == toReplace)
2176                                 a = replaceWith;
2177                         ++i;
2178                 }
2179                 return *this;
2180         }
2181
2182
2183         //! Replaces all instances of a string with another one.
2184         //! \param toReplace The string to replace.
2185         //! \param replaceWith The string replacing the old one.
2186         //! \return A reference to our current string.
2187         ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
2188         {
2189                 if (toReplace.size() == 0)
2190                         return *this;
2191
2192                 const uchar16_t* other = toReplace.c_str();
2193                 const uchar16_t* replace = replaceWith.c_str();
2194                 const u32 other_size = toReplace.size_raw();
2195                 const u32 replace_size = replaceWith.size_raw();
2196
2197                 // Determine the delta.  The algorithm will change depending on the delta.
2198                 s32 delta = replace_size - other_size;
2199
2200                 // A character for character replace.  The string will not shrink or grow.
2201                 if (delta == 0)
2202                 {
2203                         s32 pos = 0;
2204                         while ((pos = find_raw(other, pos)) != -1)
2205                         {
2206                                 for (u32 i = 0; i < replace_size; ++i)
2207                                         array[pos + i] = replace[i];
2208                                 ++pos;
2209                         }
2210                         return *this;
2211                 }
2212
2213                 // We are going to be removing some characters.  The string will shrink.
2214                 if (delta < 0)
2215                 {
2216                         u32 i = 0;
2217                         for (u32 pos = 0; pos <= used; ++i, ++pos)
2218                         {
2219                                 // Is this potentially a match?
2220                                 if (array[pos] == *other)
2221                                 {
2222                                         // Check to see if we have a match.
2223                                         u32 j;
2224                                         for (j = 0; j < other_size; ++j)
2225                                         {
2226                                                 if (array[pos + j] != other[j])
2227                                                         break;
2228                                         }
2229
2230                                         // If we have a match, replace characters.
2231                                         if (j == other_size)
2232                                         {
2233                                                 for (j = 0; j < replace_size; ++j)
2234                                                         array[i + j] = replace[j];
2235                                                 i += replace_size - 1;
2236                                                 pos += other_size - 1;
2237                                                 continue;
2238                                         }
2239                                 }
2240
2241                                 // No match found, just copy characters.
2242                                 array[i - 1] = array[pos];
2243                         }
2244                         array[i] = 0;
2245                         used = i;
2246
2247                         return *this;
2248                 }
2249
2250                 // We are going to be adding characters, so the string size will increase.
2251                 // Count the number of times toReplace exists in the string so we can allocate the new size.
2252                 u32 find_count = 0;
2253                 s32 pos = 0;
2254                 while ((pos = find_raw(other, pos)) != -1)
2255                 {
2256                         ++find_count;
2257                         ++pos;
2258                 }
2259
2260                 // Re-allocate the string now, if needed.
2261                 u32 len = delta * find_count;
2262                 if (used + len >= allocated)
2263                         reallocate(used + len);
2264
2265                 // Start replacing.
2266                 pos = 0;
2267                 while ((pos = find_raw(other, pos)) != -1)
2268                 {
2269                         uchar16_t* start = array + pos + other_size - 1;
2270                         uchar16_t* ptr   = array + used;
2271                         uchar16_t* end   = array + used + delta;
2272
2273                         // Shift characters to make room for the string.
2274                         while (ptr != start)
2275                         {
2276                                 *end = *ptr;
2277                                 --ptr;
2278                                 --end;
2279                         }
2280
2281                         // Add the new string now.
2282                         for (u32 i = 0; i < replace_size; ++i)
2283                                 array[pos + i] = replace[i];
2284
2285                         pos += replace_size;
2286                         used += delta;
2287                 }
2288
2289                 // Terminate the string and return ourself.
2290                 array[used] = 0;
2291                 return *this;
2292         }
2293
2294
2295         //! Removes characters from a ustring16..
2296         //! \param c The character to remove.
2297         //! \return A reference to our current string.
2298         ustring16<TAlloc>& remove(uchar32_t c)
2299         {
2300                 u32 pos = 0;
2301                 u32 found = 0;
2302                 u32 len = (c > 0xFFFF ? 2 : 1);         // Remove characters equal to the size of c as a UTF-16 character.
2303                 for (u32 i=0; i<=used; ++i)
2304                 {
2305                         uchar32_t uc32 = 0;
2306                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2307                                 uc32 |= array[i];
2308                         else if (i + 1 <= used)
2309                         {
2310                                 // Convert the surrogate pair into a single UTF-32 character.
2311                                 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2312                         }
2313                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2314
2315                         if (uc32 == c)
2316                         {
2317                                 found += len;
2318                                 continue;
2319                         }
2320
2321                         array[pos++] = array[i];
2322                         if (len2 == 2)
2323                                 array[pos++] = array[++i];
2324                 }
2325                 used -= found;
2326                 array[used] = 0;
2327                 return *this;
2328         }
2329
2330
2331         //! Removes a ustring16 from the ustring16.
2332         //! \param toRemove The string to remove.
2333         //! \return A reference to our current string.
2334         ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
2335         {
2336                 u32 size = toRemove.size_raw();
2337                 if (size == 0) return *this;
2338
2339                 const uchar16_t* tra = toRemove.c_str();
2340                 u32 pos = 0;
2341                 u32 found = 0;
2342                 for (u32 i=0; i<=used; ++i)
2343                 {
2344                         u32 j = 0;
2345                         while (j < size)
2346                         {
2347                                 if (array[i + j] != tra[j])
2348                                         break;
2349                                 ++j;
2350                         }
2351                         if (j == size)
2352                         {
2353                                 found += size;
2354                                 i += size - 1;
2355                                 continue;
2356                         }
2357
2358                         array[pos++] = array[i];
2359                 }
2360                 used -= found;
2361                 array[used] = 0;
2362                 return *this;
2363         }
2364
2365
2366         //! Removes characters from the ustring16.
2367         //! \param characters The characters to remove.
2368         //! \return A reference to our current string.
2369         ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
2370         {
2371                 if (characters.size_raw() == 0)
2372                         return *this;
2373
2374                 u32 pos = 0;
2375                 u32 found = 0;
2376                 const_iterator iter(characters);
2377                 for (u32 i=0; i<=used; ++i)
2378                 {
2379                         uchar32_t uc32 = 0;
2380                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2381                                 uc32 |= array[i];
2382                         else if (i + 1 <= used)
2383                         {
2384                                 // Convert the surrogate pair into a single UTF-32 character.
2385                                 uc32 = unicode::toUTF32(array[i], array[i+1]);
2386                         }
2387                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2388
2389                         bool cont = false;
2390                         iter.toStart();
2391                         while (!iter.atEnd())
2392                         {
2393                                 uchar32_t c = *iter;
2394                                 if (uc32 == c)
2395                                 {
2396                                         found += (c > 0xFFFF ? 2 : 1);          // Remove characters equal to the size of c as a UTF-16 character.
2397                                         ++i;
2398                                         cont = true;
2399                                         break;
2400                                 }
2401                                 ++iter;
2402                         }
2403                         if (cont) continue;
2404
2405                         array[pos++] = array[i];
2406                         if (len2 == 2)
2407                                 array[pos++] = array[++i];
2408                 }
2409                 used -= found;
2410                 array[used] = 0;
2411                 return *this;
2412         }
2413
2414
2415         //! Trims the ustring16.
2416         //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16.
2417         //! \param whitespace The characters that are to be considered as whitespace.
2418         //! \return A reference to our current string.
2419         ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
2420         {
2421                 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2422
2423                 // find start and end of the substring without the specified characters
2424                 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2425                 if (begin == -1)
2426                         return (*this="");
2427
2428                 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2429
2430                 return (*this = subString(begin, (end +1) - begin));
2431         }
2432
2433
2434         //! Erases a character from the ustring16.
2435         //! May be slow, because all elements following after the erased element have to be copied.
2436         //! \param index Index of element to be erased.
2437         //! \return A reference to our current string.
2438         ustring16<TAlloc>& erase(u32 index)
2439         {
2440                 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2441
2442                 iterator i(*this, index);
2443
2444                 uchar32_t t = *i;
2445                 u32 len = (t > 0xFFFF ? 2 : 1);
2446
2447                 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2448                         array[j - len] = array[j];
2449
2450                 used -= len;
2451                 array[used] = 0;
2452
2453                 return *this;
2454         }
2455
2456
2457         //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2458         //! \return A reference to our current string.
2459         ustring16<TAlloc>& validate()
2460         {
2461                 // Validate all unicode characters.
2462                 for (u32 i=0; i<allocated; ++i)
2463                 {
2464                         // Terminate on existing null.
2465                         if (array[i] == 0)
2466                         {
2467                                 used = i;
2468                                 return *this;
2469                         }
2470                         if (UTF16_IS_SURROGATE(array[i]))
2471                         {
2472                                 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2473                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2474                                 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2475                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2476                                 ++i;
2477                         }
2478                         if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2479                                 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2480                 }
2481
2482                 // terminate
2483                 used = 0;
2484                 if (allocated > 0)
2485                 {
2486                         used = allocated - 1;
2487                         array[used] = 0;
2488                 }
2489                 return *this;
2490         }
2491
2492
2493         //! Gets the last char of the ustring16, or 0.
2494         //! \return The last char of the ustring16, or 0.
2495         uchar32_t lastChar() const
2496         {
2497                 if (used < 1)
2498                         return 0;
2499
2500                 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2501                 {
2502                         // Make sure we have a paired surrogate.
2503                         if (used < 2)
2504                                 return 0;
2505
2506                         // Check for an invalid surrogate.
2507                         if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2508                                 return 0;
2509
2510                         // Convert the surrogate pair into a single UTF-32 character.
2511                         return unicode::toUTF32(array[used-2], array[used-1]);
2512                 }
2513                 else
2514                 {
2515                         return array[used-1];
2516                 }
2517         }
2518
2519
2520         //! Split the ustring16 into parts.
2521         /** This method will split a ustring16 at certain delimiter characters
2522         into the container passed in as reference. The type of the container
2523         has to be given as template parameter. It must provide a push_back and
2524         a size method.
2525         \param ret The result container
2526         \param c C-style ustring16 of delimiter characters
2527         \param count Number of delimiter characters
2528         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2529         container. If two delimiters occur without a character in between, an
2530         empty substring would be placed in the result. If this flag is set,
2531         only non-empty strings are stored.
2532         \param keepSeparators Flag which allows to add the separator to the
2533         result ustring16. If this flag is true, the concatenation of the
2534         substrings results in the original ustring16. Otherwise, only the
2535         characters between the delimiters are returned.
2536         \return The number of resulting substrings
2537         */
2538         template<class container>
2539         u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2540         {
2541                 if (!c)
2542                         return 0;
2543
2544                 const_iterator i(*this);
2545                 const u32 oldSize=ret.size();
2546                 u32 pos = 0;
2547                 u32 lastpos = 0;
2548                 u32 lastpospos = 0;
2549                 bool lastWasSeparator = false;
2550                 while (!i.atEnd())
2551                 {
2552                         uchar32_t ch = *i;
2553                         bool foundSeparator = false;
2554                         for (u32 j=0; j<count; ++j)
2555                         {
2556                                 if (ch == c[j])
2557                                 {
2558                                         if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2559                                                         !lastWasSeparator)
2560                                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
2561                                         foundSeparator = true;
2562                                         lastpos = (keepSeparators ? pos : pos + 1);
2563                                         lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2564                                         break;
2565                                 }
2566                         }
2567                         lastWasSeparator = foundSeparator;
2568                         ++pos;
2569                         ++i;
2570                 }
2571                 u32 s = size() + 1;
2572                 if (s > lastpos)
2573                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
2574                 return ret.size()-oldSize;
2575         }
2576
2577
2578         //! Split the ustring16 into parts.
2579         /** This method will split a ustring16 at certain delimiter characters
2580         into the container passed in as reference. The type of the container
2581         has to be given as template parameter. It must provide a push_back and
2582         a size method.
2583         \param ret The result container
2584         \param c A unicode string of delimiter characters
2585         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2586         container. If two delimiters occur without a character in between, an
2587         empty substring would be placed in the result. If this flag is set,
2588         only non-empty strings are stored.
2589         \param keepSeparators Flag which allows to add the separator to the
2590         result ustring16. If this flag is true, the concatenation of the
2591         substrings results in the original ustring16. Otherwise, only the
2592         characters between the delimiters are returned.
2593         \return The number of resulting substrings
2594         */
2595         template<class container>
2596         u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2597         {
2598                 core::array<uchar32_t> v = c.toUTF32();
2599                 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2600         }
2601
2602
2603         //! Gets the size of the allocated memory buffer for the string.
2604         //! \return The size of the allocated memory buffer.
2605         u32 capacity() const
2606         {
2607                 return allocated;
2608         }
2609
2610
2611         //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates.
2612         //! \return The raw number of UTF-16 code points, excluding the trialing NUL.
2613         u32 size_raw() const
2614         {
2615                 return used;
2616         }
2617
2618
2619         //! Inserts a character into the string.
2620         //! \param c The character to insert.
2621         //! \param pos The position to insert the character.
2622         //! \return A reference to our current string.
2623         ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
2624         {
2625                 u8 len = (c > 0xFFFF ? 2 : 1);
2626
2627                 if (used + len >= allocated)
2628                         reallocate(used + len);
2629
2630                 used += len;
2631
2632                 iterator iter(*this, pos);
2633                 for (u32 i = used - 2; i > iter.getPos(); --i)
2634                         array[i] = array[i - len];
2635
2636                 if (c > 0xFFFF)
2637                 {
2638                         // c will be multibyte, so split it up into a surrogate pair.
2639                         uchar16_t x = static_cast<uchar16_t>(c);
2640                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2641                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2642                         array[iter.getPos()] = vh;
2643                         array[iter.getPos()+1] = vl;
2644                 }
2645                 else
2646                 {
2647                         array[iter.getPos()] = static_cast<uchar16_t>(c);
2648                 }
2649                 array[used] = 0;
2650                 return *this;
2651         }
2652
2653
2654         //! Inserts a string into the string.
2655         //! \param c The string to insert.
2656         //! \param pos The position to insert the string.
2657         //! \return A reference to our current string.
2658         ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
2659         {
2660                 u32 len = c.size_raw();
2661                 if (len == 0) return *this;
2662
2663                 if (used + len >= allocated)
2664                         reallocate(used + len);
2665
2666                 used += len;
2667
2668                 iterator iter(*this, pos);
2669                 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2670                         array[i] = array[i - len];
2671
2672                 const uchar16_t* s = c.c_str();
2673                 for (u32 i = 0; i < len; ++i)
2674                 {
2675                         array[pos++] = *s;
2676                         ++s;
2677                 }
2678
2679                 array[used] = 0;
2680                 return *this;
2681         }
2682
2683
2684         //! Inserts a character into the string.
2685         //! \param c The character to insert.
2686         //! \param pos The position to insert the character.
2687         //! \return A reference to our current string.
2688         ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
2689         {
2690                 if (used + 1 >= allocated)
2691                         reallocate(used + 1);
2692
2693                 ++used;
2694
2695                 for (u32 i = used - 1; i > pos; --i)
2696                         array[i] = array[i - 1];
2697
2698                 array[pos] = c;
2699                 array[used] = 0;
2700                 return *this;
2701         }
2702
2703
2704         //! Removes a character from string.
2705         //! \param pos Position of the character to remove.
2706         //! \return A reference to our current string.
2707         ustring16<TAlloc>& erase_raw(u32 pos)
2708         {
2709                 for (u32 i=pos; i<=used; ++i)
2710                 {
2711                         array[i] = array[i + 1];
2712                 }
2713                 --used;
2714                 array[used] = 0;
2715                 return *this;
2716         }
2717
2718
2719         //! Replaces a character in the string.
2720         //! \param c The new character.
2721         //! \param pos The position of the character to replace.
2722         //! \return A reference to our current string.
2723         ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
2724         {
2725                 array[pos] = c;
2726                 return *this;
2727         }
2728
2729
2730         //! Returns an iterator to the beginning of the string.
2731         //! \return An iterator to the beginning of the string.
2732         iterator begin()
2733         {
2734                 iterator i(*this, 0);
2735                 return i;
2736         }
2737
2738
2739         //! Returns an iterator to the beginning of the string.
2740         //! \return An iterator to the beginning of the string.
2741         const_iterator begin() const
2742         {
2743                 const_iterator i(*this, 0);
2744                 return i;
2745         }
2746
2747
2748         //! Returns an iterator to the beginning of the string.
2749         //! \return An iterator to the beginning of the string.
2750         const_iterator cbegin() const
2751         {
2752                 const_iterator i(*this, 0);
2753                 return i;
2754         }
2755
2756
2757         //! Returns an iterator to the end of the string.
2758         //! \return An iterator to the end of the string.
2759         iterator end()
2760         {
2761                 iterator i(*this, 0);
2762                 i.toEnd();
2763                 return i;
2764         }
2765
2766
2767         //! Returns an iterator to the end of the string.
2768         //! \return An iterator to the end of the string.
2769         const_iterator end() const
2770         {
2771                 const_iterator i(*this, 0);
2772                 i.toEnd();
2773                 return i;
2774         }
2775
2776
2777         //! Returns an iterator to the end of the string.
2778         //! \return An iterator to the end of the string.
2779         const_iterator cend() const
2780         {
2781                 const_iterator i(*this, 0);
2782                 i.toEnd();
2783                 return i;
2784         }
2785
2786
2787         //! Converts the string to a UTF-8 encoded string.
2788         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2789         //! \return A string containing the UTF-8 encoded string.
2790         core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2791         {
2792                 core::string<uchar8_t> ret;
2793                 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2794                 const_iterator iter(*this, 0);
2795
2796                 // Add the byte order mark if the user wants it.
2797                 if (addBOM)
2798                 {
2799                         ret.append(unicode::BOM_ENCODE_UTF8[0]);
2800                         ret.append(unicode::BOM_ENCODE_UTF8[1]);
2801                         ret.append(unicode::BOM_ENCODE_UTF8[2]);
2802                 }
2803
2804                 while (!iter.atEnd())
2805                 {
2806                         uchar32_t c = *iter;
2807                         if (c > 0xFFFF)
2808                         {       // 4 bytes
2809                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2810                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2811                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2812                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2813                                 ret.append(b1);
2814                                 ret.append(b2);
2815                                 ret.append(b3);
2816                                 ret.append(b4);
2817                         }
2818                         else if (c > 0x7FF)
2819                         {       // 3 bytes
2820                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2821                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2822                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2823                                 ret.append(b1);
2824                                 ret.append(b2);
2825                                 ret.append(b3);
2826                         }
2827                         else if (c > 0x7F)
2828                         {       // 2 bytes
2829                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2830                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2831                                 ret.append(b1);
2832                                 ret.append(b2);
2833                         }
2834                         else
2835                         {       // 1 byte
2836                                 ret.append(static_cast<uchar8_t>(c));
2837                         }
2838                         ++iter;
2839                 }
2840                 return ret;
2841         }
2842
2843
2844         //! Converts the string to a UTF-8 encoded string array.
2845         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2846         //! \return An array containing the UTF-8 encoded string.
2847         core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2848         {
2849                 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2850                 const_iterator iter(*this, 0);
2851
2852                 // Add the byte order mark if the user wants it.
2853                 if (addBOM)
2854                 {
2855                         ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2856                         ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2857                         ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2858                 }
2859
2860                 while (!iter.atEnd())
2861                 {
2862                         uchar32_t c = *iter;
2863                         if (c > 0xFFFF)
2864                         {       // 4 bytes
2865                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2866                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2867                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2868                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2869                                 ret.push_back(b1);
2870                                 ret.push_back(b2);
2871                                 ret.push_back(b3);
2872                                 ret.push_back(b4);
2873                         }
2874                         else if (c > 0x7FF)
2875                         {       // 3 bytes
2876                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2877                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2878                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2879                                 ret.push_back(b1);
2880                                 ret.push_back(b2);
2881                                 ret.push_back(b3);
2882                         }
2883                         else if (c > 0x7F)
2884                         {       // 2 bytes
2885                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2886                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2887                                 ret.push_back(b1);
2888                                 ret.push_back(b2);
2889                         }
2890                         else
2891                         {       // 1 byte
2892                                 ret.push_back(static_cast<uchar8_t>(c));
2893                         }
2894                         ++iter;
2895                 }
2896                 ret.push_back(0);
2897                 return ret;
2898         }
2899
2900
#ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
        //! Converts the string to a UTF-16 encoded string.
        //! The byte-order mark, when requested, is stored in the requested byte order and is
        //! followed by the text, byte-swapped if the requested order differs from the native one.
        //! \param endian The desired endianness of the string.
        //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
        //! \return A string containing the UTF-16 encoded string.
        core::string<char16_t> toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
        {
                core::string<char16_t> ret;
                ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);

                // Add the BOM if specified.  It is appended as a real character; writing it into
                // the reserved storage would not grow the string, and the text appended below
                // would then overwrite it.
                if (addBOM)
                {
                        char16_t bom = unicode::BOM;
                        if (endian != unicode::EUTFEE_NATIVE)
                        {
                                // Lay the two BOM bytes out in exactly the requested byte order.
                                const bool little = (endian == unicode::EUTFEE_LITTLE);
                                uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(&bom);
                                ptr8[0] = little ? unicode::BOM_ENCODE_UTF16_LE[0] : unicode::BOM_ENCODE_UTF16_BE[0];
                                ptr8[1] = little ? unicode::BOM_ENCODE_UTF16_LE[1] : unicode::BOM_ENCODE_UTF16_BE[1];
                        }
                        ret.append(bom);
                }

                // Append the text one code unit at a time, swapping bytes when required.
                const bool swapBytes = (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian);
                for (u32 i = 0; i < used; ++i)
                {
                        const uchar16_t unit = swapBytes ? unicode::swapEndian16(array[i]) : array[i];
                        ret.append(static_cast<char16_t>(unit));
                }
                return ret;
        }
#endif
2940
2941
2942         //! Converts the string to a UTF-16 encoded string array.
2943         //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2944         //! \param endian The desired endianness of the string.
2945         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2946         //! \return An array containing the UTF-16 encoded string.
2947         core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2948         {
2949                 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2950                 uchar16_t* ptr = ret.pointer();
2951
2952                 // Add the BOM if specified.
2953                 if (addBOM)
2954                 {
2955                         if (endian == unicode::EUTFEE_NATIVE)
2956                                 *ptr = unicode::BOM;
2957                         else if (endian == unicode::EUTFEE_LITTLE)
2958                         {
2959                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2960                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2961                                 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2962                         }
2963                         else
2964                         {
2965                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2966                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2967                                 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2968                         }
2969                         ++ptr;
2970                 }
2971
2972                 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2973                 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2974                 {
2975                         for (u32 i = 0; i <= used; ++i)
2976                                 ptr[i] = unicode::swapEndian16(ptr[i]);
2977                 }
2978                 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2979                 ret.push_back(0);
2980                 return ret;
2981         }
2982
2983
2984 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
2985         //! Converts the string to a UTF-32 encoded string.
2986         //! \param endian The desired endianness of the string.
2987         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2988         //! \return A string containing the UTF-32 encoded string.
2989         core::string<char32_t> toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2990         {
2991                 core::string<char32_t> ret;
2992                 ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0));
2993                 const_iterator iter(*this, 0);
2994
2995                 // Add the BOM if specified.
2996                 if (addBOM)
2997                 {
2998                         if (endian == unicode::EUTFEE_NATIVE)
2999                                 ret.append(unicode::BOM);
3000                         else
3001                         {
3002                                 union
3003                                 {
3004                                         uchar32_t full;
3005                                         u8 chunk[4];
3006                                 } t;
3007
3008                                 if (endian == unicode::EUTFEE_LITTLE)
3009                                 {
3010                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3011                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3012                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3013                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3014                                 }
3015                                 else
3016                                 {
3017                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3018                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3019                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3020                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3021                                 }
3022                                 ret.append(t.full);
3023                         }
3024                 }
3025
3026                 while (!iter.atEnd())
3027                 {
3028                         uchar32_t c = *iter;
3029                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3030                                 c = unicode::swapEndian32(c);
3031                         ret.append(c);
3032                         ++iter;
3033                 }
3034                 return ret;
3035         }
3036 #endif
3037
3038
3039         //! Converts the string to a UTF-32 encoded string array.
3040         //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
3041         //! \param endian The desired endianness of the string.
3042         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3043         //! \return An array containing the UTF-32 encoded string.
3044         core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3045         {
3046                 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
3047                 const_iterator iter(*this, 0);
3048
3049                 // Add the BOM if specified.
3050                 if (addBOM)
3051                 {
3052                         if (endian == unicode::EUTFEE_NATIVE)
3053                                 ret.push_back(unicode::BOM);
3054                         else
3055                         {
3056                                 union
3057                                 {
3058                                         uchar32_t full;
3059                                         u8 chunk[4];
3060                                 } t;
3061
3062                                 if (endian == unicode::EUTFEE_LITTLE)
3063                                 {
3064                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3065                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3066                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3067                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3068                                 }
3069                                 else
3070                                 {
3071                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3072                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3073                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3074                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3075                                 }
3076                                 ret.push_back(t.full);
3077                         }
3078                 }
3079                 ret.push_back(0);
3080
3081                 while (!iter.atEnd())
3082                 {
3083                         uchar32_t c = *iter;
3084                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3085                                 c = unicode::swapEndian32(c);
3086                         ret.push_back(c);
3087                         ++iter;
3088                 }
3089                 return ret;
3090         }
3091
3092
3093         //! Converts the string to a wchar_t encoded string.
3094         /** The size of a wchar_t changes depending on the platform.  This function will store a
3095         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3096         //! \param endian The desired endianness of the string.
3097         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3098         //! \return A string containing the wchar_t encoded string.
3099         core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3100         {
3101                 if (sizeof(wchar_t) == 4)
3102                 {
3103                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3104                         core::stringw ret(a.pointer());
3105                         return ret;
3106                 }
3107                 else if (sizeof(wchar_t) == 2)
3108                 {
3109                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3110                         {
3111                                 core::stringw ret(array);
3112                                 return ret;
3113                         }
3114                         else
3115                         {
3116                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3117                                 core::stringw ret(a.pointer());
3118                                 return ret;
3119                         }
3120                 }
3121                 else if (sizeof(wchar_t) == 1)
3122                 {
3123                         core::array<uchar8_t> a(toUTF8(addBOM));
3124                         core::stringw ret(a.pointer());
3125                         return ret;
3126                 }
3127
3128                 // Shouldn't happen.
3129                 return core::stringw();
3130         }
3131
3132
3133         //! Converts the string to a wchar_t encoded string array.
3134         /** The size of a wchar_t changes depending on the platform.  This function will store a
3135         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3136         //! \param endian The desired endianness of the string.
3137         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3138         //! \return An array containing the wchar_t encoded string.
3139         core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3140         {
3141                 if (sizeof(wchar_t) == 4)
3142                 {
3143                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3144                         core::array<wchar_t> ret(a.size());
3145                         ret.set_used(a.size());
3146                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
3147                         return ret;
3148                 }
3149                 if (sizeof(wchar_t) == 2)
3150                 {
3151                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3152                         {
3153                                 core::array<wchar_t> ret(used);
3154                                 ret.set_used(used);
3155                                 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
3156                                 return ret;
3157                         }
3158                         else
3159                         {
3160                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3161                                 core::array<wchar_t> ret(a.size());
3162                                 ret.set_used(a.size());
3163                                 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
3164                                 return ret;
3165                         }
3166                 }
3167                 if (sizeof(wchar_t) == 1)
3168                 {
3169                         core::array<uchar8_t> a(toUTF8(addBOM));
3170                         core::array<wchar_t> ret(a.size());
3171                         ret.set_used(a.size());
3172                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
3173                         return ret;
3174                 }
3175
3176                 // Shouldn't happen.
3177                 return core::array<wchar_t>();
3178         }
3179
3180         //! Converts the string to a properly encoded io::path string.
3181         //! \param endian The desired endianness of the string.
3182         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3183         //! \return An io::path string containing the properly encoded string.
3184         io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3185         {
3186 #if defined(_IRR_WCHAR_FILESYSTEM)
3187                 return toWCHAR_s(endian, addBOM);
3188 #else
3189                 return toUTF8_s(addBOM);
3190 #endif
3191         }
3192
3193         //! Loads an unknown stream of data.
3194         //! Will attempt to determine if the stream is unicode data.  Useful for loading from files.
3195         //! \param data The data stream to load from.
3196         //! \param data_size The length of the data string.
3197         //! \return A reference to our current string.
3198         ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
3199         {
3200                 // Clear our string.
3201                 *this = "";
3202                 if (!data)
3203                         return *this;
3204
3205                 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
3206                 switch (e)
3207                 {
3208                         default:
3209                         case unicode::EUTFE_UTF8:
3210                                 append((uchar8_t*)data, data_size);
3211                                 break;
3212
3213                         case unicode::EUTFE_UTF16:
3214                         case unicode::EUTFE_UTF16_BE:
3215                         case unicode::EUTFE_UTF16_LE:
3216                                 append((uchar16_t*)data, data_size / 2);
3217                                 break;
3218
3219                         case unicode::EUTFE_UTF32:
3220                         case unicode::EUTFE_UTF32_BE:
3221                         case unicode::EUTFE_UTF32_LE:
3222                                 append((uchar32_t*)data, data_size / 4);
3223                                 break;
3224                 }
3225
3226                 return *this;
3227         }
3228
3229         //! Gets the encoding of the Unicode string this class contains.
3230         //! \return An enum describing the current encoding of this string.
3231         const unicode::EUTF_ENCODE getEncoding() const
3232         {
3233                 return encoding;
3234         }
3235
3236         //! Gets the endianness of the Unicode string this class contains.
3237         //! \return An enum describing the endianness of this string.
3238         const unicode::EUTF_ENDIAN getEndianness() const
3239         {
3240                 if (encoding == unicode::EUTFE_UTF16_LE ||
3241                         encoding == unicode::EUTFE_UTF32_LE)
3242                         return unicode::EUTFEE_LITTLE;
3243                 else return unicode::EUTFEE_BIG;
3244         }
3245
3246 private:
3247
3248         //! Reallocate the string, making it bigger or smaller.
3249         //! \param new_size The new size of the string.
3250         void reallocate(u32 new_size)
3251         {
3252                 uchar16_t* old_array = array;
3253
3254                 array = allocator.allocate(new_size + 1); //new u16[new_size];
3255                 allocated = new_size + 1;
3256                 if (old_array == 0) return;
3257
3258                 u32 amount = used < new_size ? used : new_size;
3259                 for (u32 i=0; i<=amount; ++i)
3260                         array[i] = old_array[i];
3261
3262                 if (allocated <= used)
3263                         used = allocated - 1;
3264
3265                 array[used] = 0;
3266
3267                 allocator.deallocate(old_array); // delete [] old_array;
3268         }
3269
3270         //--- member variables
3271
3272         uchar16_t* array;
3273         unicode::EUTF_ENCODE encoding;
3274         u32 allocated;
3275         u32 used;
3276         TAlloc allocator;
3277         //irrAllocator<uchar16_t> allocator;
3278 };
3279
3280 typedef ustring16<irrAllocator<uchar16_t> > ustring;
3281
3282
3283 //! Appends two ustring16s.
3284 template <typename TAlloc>
3285 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
3286 {
3287         ustring16<TAlloc> ret(left);
3288         ret += right;
3289         return ret;
3290 }
3291
3292
3293 //! Appends a ustring16 and a null-terminated unicode string.
3294 template <typename TAlloc, class B>
3295 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
3296 {
3297         ustring16<TAlloc> ret(left);
3298         ret += right;
3299         return ret;
3300 }
3301
3302
3303 //! Appends a ustring16 and a null-terminated unicode string.
3304 template <class B, typename TAlloc>
3305 inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
3306 {
3307         ustring16<TAlloc> ret(left);
3308         ret += right;
3309         return ret;
3310 }
3311
3312
3313 //! Appends a ustring16 and an Irrlicht string.
3314 template <typename TAlloc, typename B, typename BAlloc>
3315 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B, BAlloc>& right)
3316 {
3317         ustring16<TAlloc> ret(left);
3318         ret += right;
3319         return ret;
3320 }
3321
3322
3323 //! Appends a ustring16 and an Irrlicht string.
3324 template <typename TAlloc, typename B, typename BAlloc>
3325 inline ustring16<TAlloc> operator+(const string<B, BAlloc>& left, const ustring16<TAlloc>& right)
3326 {
3327         ustring16<TAlloc> ret(left);
3328         ret += right;
3329         return ret;
3330 }
3331
3332
3333 //! Appends a ustring16 and a std::basic_string.
3334 template <typename TAlloc, typename B, typename A, typename BAlloc>
3335 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
3336 {
3337         ustring16<TAlloc> ret(left);
3338         ret += right;
3339         return ret;
3340 }
3341
3342
3343 //! Appends a ustring16 and a std::basic_string.
3344 template <typename TAlloc, typename B, typename A, typename BAlloc>
3345 inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
3346 {
3347         ustring16<TAlloc> ret(left);
3348         ret += right;
3349         return ret;
3350 }
3351
3352
3353 //! Appends a ustring16 and a char.
3354 template <typename TAlloc>
3355 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
3356 {
3357         ustring16<TAlloc> ret(left);
3358         ret += right;
3359         return ret;
3360 }
3361
3362
3363 //! Appends a ustring16 and a char.
3364 template <typename TAlloc>
3365 inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
3366 {
3367         ustring16<TAlloc> ret(left);
3368         ret += right;
3369         return ret;
3370 }
3371
3372
3373 #ifdef USTRING_CPP0X_NEWLITERALS
3374 //! Appends a ustring16 and a uchar32_t.
3375 template <typename TAlloc>
3376 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
3377 {
3378         ustring16<TAlloc> ret(left);
3379         ret += right;
3380         return ret;
3381 }
3382
3383
3384 //! Appends a ustring16 and a uchar32_t.
3385 template <typename TAlloc>
3386 inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
3387 {
3388         ustring16<TAlloc> ret(left);
3389         ret += right;
3390         return ret;
3391 }
3392 #endif
3393
3394
3395 //! Appends a ustring16 and a short.
3396 template <typename TAlloc>
3397 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
3398 {
3399         ustring16<TAlloc> ret(left);
3400         ret += core::stringc(right);
3401         return ret;
3402 }
3403
3404
3405 //! Appends a ustring16 and a short.
3406 template <typename TAlloc>
3407 inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
3408 {
3409         ustring16<TAlloc> ret((core::stringc(left)));
3410         ret += right;
3411         return ret;
3412 }
3413
3414
3415 //! Appends a ustring16 and an unsigned short.
3416 template <typename TAlloc>
3417 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
3418 {
3419         ustring16<TAlloc> ret(left);
3420         ret += core::stringc(right);
3421         return ret;
3422 }
3423
3424
3425 //! Appends a ustring16 and an unsigned short.
3426 template <typename TAlloc>
3427 inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
3428 {
3429         ustring16<TAlloc> ret((core::stringc(left)));
3430         ret += right;
3431         return ret;
3432 }
3433
3434
3435 //! Appends a ustring16 and an int.
3436 template <typename TAlloc>
3437 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
3438 {
3439         ustring16<TAlloc> ret(left);
3440         ret += core::stringc(right);
3441         return ret;
3442 }
3443
3444
3445 //! Appends a ustring16 and an int.
3446 template <typename TAlloc>
3447 inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
3448 {
3449         ustring16<TAlloc> ret((core::stringc(left)));
3450         ret += right;
3451         return ret;
3452 }
3453
3454
3455 //! Appends a ustring16 and an unsigned int.
3456 template <typename TAlloc>
3457 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
3458 {
3459         ustring16<TAlloc> ret(left);
3460         ret += core::stringc(right);
3461         return ret;
3462 }
3463
3464
3465 //! Appends a ustring16 and an unsigned int.
3466 template <typename TAlloc>
3467 inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
3468 {
3469         ustring16<TAlloc> ret((core::stringc(left)));
3470         ret += right;
3471         return ret;
3472 }
3473
3474
3475 //! Appends a ustring16 and a long.
3476 template <typename TAlloc>
3477 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
3478 {
3479         ustring16<TAlloc> ret(left);
3480         ret += core::stringc(right);
3481         return ret;
3482 }
3483
3484
3485 //! Appends a ustring16 and a long.
3486 template <typename TAlloc>
3487 inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
3488 {
3489         ustring16<TAlloc> ret((core::stringc(left)));
3490         ret += right;
3491         return ret;
3492 }
3493
3494
3495 //! Appends a ustring16 and an unsigned long.
3496 template <typename TAlloc>
3497 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
3498 {
3499         ustring16<TAlloc> ret(left);
3500         ret += core::stringc(right);
3501         return ret;
3502 }
3503
3504
3505 //! Appends a ustring16 and an unsigned long.
3506 template <typename TAlloc>
3507 inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
3508 {
3509         ustring16<TAlloc> ret((core::stringc(left)));
3510         ret += right;
3511         return ret;
3512 }
3513
3514
3515 //! Appends a ustring16 and a float.
3516 template <typename TAlloc>
3517 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
3518 {
3519         ustring16<TAlloc> ret(left);
3520         ret += core::stringc(right);
3521         return ret;
3522 }
3523
3524
3525 //! Appends a ustring16 and a float.
3526 template <typename TAlloc>
3527 inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
3528 {
3529         ustring16<TAlloc> ret((core::stringc(left)));
3530         ret += right;
3531         return ret;
3532 }
3533
3534
3535 //! Appends a ustring16 and a double.
3536 template <typename TAlloc>
3537 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
3538 {
3539         ustring16<TAlloc> ret(left);
3540         ret += core::stringc(right);
3541         return ret;
3542 }
3543
3544
3545 //! Appends a ustring16 and a double.
3546 template <typename TAlloc>
3547 inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
3548 {
3549         ustring16<TAlloc> ret((core::stringc(left)));
3550         ret += right;
3551         return ret;
3552 }
3553
3554
3555 #ifdef USTRING_CPP0X
3556 //! Appends two ustring16s.
3557 template <typename TAlloc>
3558 inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
3559 {
3560         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3561         right.insert(left, 0);
3562         return std::move(right);
3563 }
3564
3565
3566 //! Appends two ustring16s.
3567 template <typename TAlloc>
3568 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
3569 {
3570         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3571         left.append(right);
3572         return std::move(left);
3573 }
3574
3575
3576 //! Appends two ustring16s.
3577 template <typename TAlloc>
3578 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
3579 {
3580         //std::cout << "MOVE operator+(&&, &&)" << std::endl;
3581         if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3582                 (right.capacity() - right.size_raw() < left.size_raw()))
3583         {
3584                 left.append(right);
3585                 return std::move(left);
3586         }
3587         else
3588         {
3589                 right.insert(left, 0);
3590                 return std::move(right);
3591         }
3592 }
3593
3594
3595 //! Appends a ustring16 and a null-terminated unicode string.
3596 template <typename TAlloc, class B>
3597 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
3598 {
3599         //std::cout << "MOVE operator+(&&, B*)" << std::endl;
3600         left.append(right);
3601         return std::move(left);
3602 }
3603
3604
3605 //! Appends a ustring16 and a null-terminated unicode string.
3606 template <class B, typename TAlloc>
3607 inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
3608 {
3609         //std::cout << "MOVE operator+(B*, &&)" << std::endl;
3610         right.insert(left, 0);
3611         return std::move(right);
3612 }
3613
3614
3615 //! Appends a ustring16 and an Irrlicht string.
3616 template <typename TAlloc, typename B, typename BAlloc>
3617 inline ustring16<TAlloc>&& operator+(const string<B, BAlloc>& left, ustring16<TAlloc>&& right)
3618 {
3619         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3620         right.insert(left, 0);
3621         return std::move(right);
3622 }
3623
3624
3625 //! Appends a ustring16 and an Irrlicht string.
3626 template <typename TAlloc, typename B, typename BAlloc>
3627 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B, BAlloc>& right)
3628 {
3629         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3630         left.append(right);
3631         return std::move(left);
3632 }
3633
3634
3635 //! Appends a ustring16 and a std::basic_string.
3636 template <typename TAlloc, typename B, typename A, typename BAlloc>
3637 inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
3638 {
3639         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3640         right.insert(core::ustring16<TAlloc>(left), 0);
3641         return std::move(right);
3642 }
3643
3644
3645 //! Appends a ustring16 and a std::basic_string.
3646 template <typename TAlloc, typename B, typename A, typename BAlloc>
3647 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
3648 {
3649         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3650         left.append(right);
3651         return std::move(left);
3652 }
3653
3654
3655 //! Appends a ustring16 and a char.
3656 template <typename TAlloc>
3657 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
3658 {
3659         left.append((uchar32_t)right);
3660         return std::move(left);
3661 }
3662
3663
3664 //! Appends a ustring16 and a char.
3665 template <typename TAlloc>
3666 inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
3667 {
3668         right.insert((uchar32_t)left, 0);
3669         return std::move(right);
3670 }
3671
3672
3673 #ifdef USTRING_CPP0X_NEWLITERALS
3674 //! Appends a ustring16 and a uchar32_t.
3675 template <typename TAlloc>
3676 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
3677 {
3678         left.append(right);
3679         return std::move(left);
3680 }
3681
3682
3683 //! Appends a ustring16 and a uchar32_t.
3684 template <typename TAlloc>
3685 inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
3686 {
3687         right.insert(left, 0);
3688         return std::move(right);
3689 }
3690 #endif
3691
3692
3693 //! Appends a ustring16 and a short.
3694 template <typename TAlloc>
3695 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
3696 {
3697         left.append(core::stringc(right));
3698         return std::move(left);
3699 }
3700
3701
3702 //! Appends a ustring16 and a short.
3703 template <typename TAlloc>
3704 inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
3705 {
3706         right.insert(core::stringc(left), 0);
3707         return std::move(right);
3708 }
3709
3710
3711 //! Appends a ustring16 and an unsigned short.
3712 template <typename TAlloc>
3713 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
3714 {
3715         left.append(core::stringc(right));
3716         return std::move(left);
3717 }
3718
3719
3720 //! Appends a ustring16 and an unsigned short.
3721 template <typename TAlloc>
3722 inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
3723 {
3724         right.insert(core::stringc(left), 0);
3725         return std::move(right);
3726 }
3727
3728
3729 //! Appends a ustring16 and an int.
3730 template <typename TAlloc>
3731 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
3732 {
3733         left.append(core::stringc(right));
3734         return std::move(left);
3735 }
3736
3737
3738 //! Appends a ustring16 and an int.
3739 template <typename TAlloc>
3740 inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
3741 {
3742         right.insert(core::stringc(left), 0);
3743         return std::move(right);
3744 }
3745
3746
3747 //! Appends a ustring16 and an unsigned int.
3748 template <typename TAlloc>
3749 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
3750 {
3751         left.append(core::stringc(right));
3752         return std::move(left);
3753 }
3754
3755
3756 //! Appends a ustring16 and an unsigned int.
3757 template <typename TAlloc>
3758 inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
3759 {
3760         right.insert(core::stringc(left), 0);
3761         return std::move(right);
3762 }
3763
3764
3765 //! Appends a ustring16 and a long.
3766 template <typename TAlloc>
3767 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
3768 {
3769         left.append(core::stringc(right));
3770         return std::move(left);
3771 }
3772
3773
3774 //! Appends a ustring16 and a long.
3775 template <typename TAlloc>
3776 inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
3777 {
3778         right.insert(core::stringc(left), 0);
3779         return std::move(right);
3780 }
3781
3782
3783 //! Appends a ustring16 and an unsigned long.
3784 template <typename TAlloc>
3785 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
3786 {
3787         left.append(core::stringc(right));
3788         return std::move(left);
3789 }
3790
3791
3792 //! Appends a ustring16 and an unsigned long.
3793 template <typename TAlloc>
3794 inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
3795 {
3796         right.insert(core::stringc(left), 0);
3797         return std::move(right);
3798 }
3799
3800
3801 //! Appends a ustring16 and a float.
3802 template <typename TAlloc>
3803 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
3804 {
3805         left.append(core::stringc(right));
3806         return std::move(left);
3807 }
3808
3809
3810 //! Appends a ustring16 and a float.
3811 template <typename TAlloc>
3812 inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
3813 {
3814         right.insert(core::stringc(left), 0);
3815         return std::move(right);
3816 }
3817
3818
3819 //! Appends a ustring16 and a double.
3820 template <typename TAlloc>
3821 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
3822 {
3823         left.append(core::stringc(right));
3824         return std::move(left);
3825 }
3826
3827
3828 //! Appends a ustring16 and a double.
3829 template <typename TAlloc>
3830 inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
3831 {
3832         right.insert(core::stringc(left), 0);
3833         return std::move(right);
3834 }
3835 #endif
3836
3837
3838 #ifndef USTRING_NO_STL
3839 //! Writes a ustring16 to an ostream.
3840 template <typename TAlloc>
3841 inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
3842 {
3843         out << in.toUTF8_s().c_str();
3844         return out;
3845 }
3846
3847 //! Writes a ustring16 to a wostream.
3848 template <typename TAlloc>
3849 inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
3850 {
3851         out << in.toWCHAR_s().c_str();
3852         return out;
3853 }
3854 #endif
3855
3856
3857 #ifndef USTRING_NO_STL
3858
3859 namespace unicode
3860 {
3861
3862 //! Hashing algorithm for hashing a ustring.  Used for things like unordered_maps.
3863 //! Algorithm taken from std::hash<std::string>.
3864 class hash : public std::unary_function<core::ustring, size_t>
3865 {
3866         public:
3867                 size_t operator()(const core::ustring& s) const
3868                 {
3869                         size_t ret = 2166136261U;
3870                         size_t index = 0;
3871                         size_t stride = 1 + s.size_raw() / 10;
3872
3873                         core::ustring::const_iterator i = s.begin();
3874                         while (i != s.end())
3875                         {
3876                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
3877                                 ret = 16777619U * ret ^ (size_t)s[(u32)index];
3878                                 index += stride;
3879                                 i += stride;
3880                         }
3881                         return (ret);
3882                 }
3883 };
3884
3885 } // end namespace unicode
3886
3887 #endif
3888
3889 } // end namespace core
3890 } // end namespace irr
3891
3892 #endif