src/cguittfont/irrUString.h

   1 /*
   2    Basic Unicode string class for Irrlicht.
   3    Copyright (c) 2009-2011 John Norman
   4
   5    This software is provided 'as-is', without any express or implied
   6    warranty. In no event will the authors be held liable for any
   7    damages arising from the use of this software.
   8
   9    Permission is granted to anyone to use this software for any
  10    purpose, including commercial applications, and to alter it and
  11    redistribute it freely, subject to the following restrictions:
  12
  13    1. The origin of this software must not be misrepresented; you
  14       must not claim that you wrote the original software. If you use
  15       this software in a product, an acknowledgment in the product
  16       documentation would be appreciated but is not required.
  17
  18    2. Altered source versions must be plainly marked as such, and
  19       must not be misrepresented as being the original software.
  20
  21    3. This notice may not be removed or altered from any source
  22       distribution.
  23
  24    The original version of this class can be located at:
  25    http://irrlicht.suckerfreegames.com/
  26
  27    John Norman
  28    john@suckerfreegames.com
  29 */
  30
  31 #ifndef __IRR_USTRING_H_INCLUDED__
  32 #define __IRR_USTRING_H_INCLUDED__
  33
  34 #if (__cplusplus > 199711L) || (_MSC_VER >= 1600) || defined(__GXX_EXPERIMENTAL_CXX0X__)
  35 #       define USTRING_CPP0X
  36 #       if defined(__GXX_EXPERIMENTAL_CXX0X__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
  37 #               define USTRING_CPP0X_NEWLITERALS
  38 #       endif
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <string.h>
  43 #include <stdlib.h>
  44 #include <endian.h>
  45
  46 #ifdef USTRING_CPP0X
  47 #       include <utility>
  48 #endif
  49
  50 #ifndef USTRING_NO_STL
  51 #       include <string>
  52 #       include <iterator>
  53 #       include <ostream>
  54 #endif
  55
  56 #include "irrTypes.h"
  57 #include "irrAllocator.h"
  58 #include "irrArray.h"
  59 #include "irrMath.h"
  60 #include "irrString.h"
  61 #include "path.h"
  62
  63 //! UTF-16 surrogate start values.
  64 static const irr::u16 UTF16_HI_SURROGATE = 0xD800;
  65 static const irr::u16 UTF16_LO_SURROGATE = 0xDC00;
  66
  //! Is a UTF-16 code unit a surrogate (either half, the high half, or the low half)?
  68 #define UTF16_IS_SURROGATE(c)           (((c) & 0xF800) == 0xD800)
  69 #define UTF16_IS_SURROGATE_HI(c)        (((c) & 0xFC00) == 0xD800)
  70 #define UTF16_IS_SURROGATE_LO(c)        (((c) & 0xFC00) == 0xDC00)
  71
  72
  73 namespace irr
  74 {
  75
  76         // Define our character types.
  77 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
  78         typedef char32_t uchar32_t;
  79         typedef char16_t uchar16_t;
  80         typedef char uchar8_t;
  81 #else
  82         typedef u32 uchar32_t;
  83         typedef u16 uchar16_t;
  84         typedef u8 uchar8_t;
  85 #endif
  86
  87 namespace core
  88 {
  89
  90 namespace unicode
  91 {
  92
  93 //! The unicode replacement character.  Used to replace invalid characters.
  94 const irr::u16 UTF_REPLACEMENT_CHARACTER = 0xFFFD;
  95
  96 //! Convert a UTF-16 surrogate pair into a UTF-32 character.
  97 //! \param high The high value of the pair.
  98 //! \param low The low value of the pair.
  99 //! \return The UTF-32 character expressed by the surrogate pair.
 100 inline uchar32_t toUTF32(uchar16_t high, uchar16_t low)
 101 {
 102         // Convert the surrogate pair into a single UTF-32 character.
 103         uchar32_t x = ((high & ((1 << 6) -1)) << 10) | (low & ((1 << 10) -1));
 104         uchar32_t wu = ((high >> 6) & ((1 << 5) - 1)) + 1;
 105         return (wu << 16) | x;
 106 }
 107
 108 //! Swaps the endianness of a 16-bit value.
 109 //! \return The new value.
 110 inline uchar16_t swapEndian16(const uchar16_t& c)
 111 {
 112         return ((c >> 8) & 0x00FF) | ((c << 8) & 0xFF00);
 113 }
 114
 115 //! Swaps the endianness of a 32-bit value.
 116 //! \return The new value.
 117 inline uchar32_t swapEndian32(const uchar32_t& c)
 118 {
 119         return  ((c >> 24) & 0x000000FF) |
 120                         ((c >> 8)  & 0x0000FF00) |
 121                         ((c << 8)  & 0x00FF0000) |
 122                         ((c << 24) & 0xFF000000);
 123 }
 124
 125 //! The Unicode byte order mark.
 126 const u16 BOM = 0xFEFF;
 127
 128 //! The size of the Unicode byte order mark in terms of the Unicode character size.
 129 const u8 BOM_UTF8_LEN = 3;
 130 const u8 BOM_UTF16_LEN = 1;
 131 const u8 BOM_UTF32_LEN = 1;
 132
 133 //! Unicode byte order marks for file operations.
 134 const u8 BOM_ENCODE_UTF8[3] = { 0xEF, 0xBB, 0xBF };
 135 const u8 BOM_ENCODE_UTF16_BE[2] = { 0xFE, 0xFF };
 136 const u8 BOM_ENCODE_UTF16_LE[2] = { 0xFF, 0xFE };
 137 const u8 BOM_ENCODE_UTF32_BE[4] = { 0x00, 0x00, 0xFE, 0xFF };
 138 const u8 BOM_ENCODE_UTF32_LE[4] = { 0xFF, 0xFE, 0x00, 0x00 };
 139
 //! The size in bytes of the Unicode byte order marks for file operations.
 141 const u8 BOM_ENCODE_UTF8_LEN = 3;
 142 const u8 BOM_ENCODE_UTF16_LEN = 2;
 143 const u8 BOM_ENCODE_UTF32_LEN = 4;
 144
 145 //! Unicode encoding type.
 enum EUTF_ENCODE
 {
     EUTFE_NONE     = 0,    //!< No encoding.
     EUTFE_UTF8     = 1,    //!< UTF-8.
     EUTFE_UTF16    = 2,    //!< UTF-16 without an explicit byte order.
     EUTFE_UTF16_LE = 3,    //!< UTF-16, little endian.
     EUTFE_UTF16_BE = 4,    //!< UTF-16, big endian.
     EUTFE_UTF32    = 5,    //!< UTF-32 without an explicit byte order.
     EUTFE_UTF32_LE = 6,    //!< UTF-32, little endian.
     EUTFE_UTF32_BE = 7     //!< UTF-32, big endian.
 };
 157
 158 //! Unicode endianness.
 enum EUTF_ENDIAN
 {
     EUTFEE_NATIVE = 0,    //!< Native byte order.
     EUTFEE_LITTLE = 1,    //!< Little endian.
     EUTFEE_BIG    = 2     //!< Big endian.
 };
 165
 166 //! Returns the specified unicode byte order mark in a byte array.
 167 //! The byte order mark is the first few bytes in a text file that signifies its encoding.
 168 /** \param mode The Unicode encoding method that we want to get the byte order mark for.
 169                 If EUTFE_UTF16 or EUTFE_UTF32 is passed, it uses the native system endianness. **/
 170 //! \return An array that contains a byte order mark.
 171 inline core::array<u8> getUnicodeBOM(EUTF_ENCODE mode)
 172 {
 173 #define COPY_ARRAY(source, size) \
 174         memcpy(ret.pointer(), source, size); \
 175         ret.set_used(size)
 176
 177         core::array<u8> ret(4);
 178         switch (mode)
 179         {
 180                 case EUTFE_UTF8:
 181                         COPY_ARRAY(BOM_ENCODE_UTF8, BOM_ENCODE_UTF8_LEN);
 182                         break;
 183                 case EUTFE_UTF16:
 184                         #ifdef __BIG_ENDIAN__
 185                                 COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 186                         #else
 187                                 COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 188                         #endif
 189                         break;
 190                 case EUTFE_UTF16_BE:
 191                         COPY_ARRAY(BOM_ENCODE_UTF16_BE, BOM_ENCODE_UTF16_LEN);
 192                         break;
 193                 case EUTFE_UTF16_LE:
 194                         COPY_ARRAY(BOM_ENCODE_UTF16_LE, BOM_ENCODE_UTF16_LEN);
 195                         break;
 196                 case EUTFE_UTF32:
 197                         #ifdef __BIG_ENDIAN__
 198                                 COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 199                         #else
 200                                 COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 201                         #endif
 202                         break;
 203                 case EUTFE_UTF32_BE:
 204                         COPY_ARRAY(BOM_ENCODE_UTF32_BE, BOM_ENCODE_UTF32_LEN);
 205                         break;
 206                 case EUTFE_UTF32_LE:
 207                         COPY_ARRAY(BOM_ENCODE_UTF32_LE, BOM_ENCODE_UTF32_LEN);
 208                         break;
 209                 case EUTFE_NONE:
 210                         // TODO sapier: fixed warning only,
 211                         // don't know if something needs to be done here
 212                         break;
 213         }
 214         return ret;
 215
 216 #undef COPY_ARRAY
 217 }
 218
 219 //! Detects if the given data stream starts with a unicode BOM.
 220 //! \param data The data stream to check.
 221 //! \return The unicode BOM associated with the data stream, or EUTFE_NONE if none was found.
 222 inline EUTF_ENCODE determineUnicodeBOM(const char* data)
 223 {
 224         if (memcmp(data, BOM_ENCODE_UTF8, 3) == 0) return EUTFE_UTF8;
 225         if (memcmp(data, BOM_ENCODE_UTF16_BE, 2) == 0) return EUTFE_UTF16_BE;
 226         if (memcmp(data, BOM_ENCODE_UTF16_LE, 2) == 0) return EUTFE_UTF16_LE;
 227         if (memcmp(data, BOM_ENCODE_UTF32_BE, 4) == 0) return EUTFE_UTF32_BE;
 228         if (memcmp(data, BOM_ENCODE_UTF32_LE, 4) == 0) return EUTFE_UTF32_LE;
 229         return EUTFE_NONE;
 230 }
 231
 232 } // end namespace unicode
 233
 234
 235 //! UTF-16 string class.
 236 template <typename TAlloc = irrAllocator<uchar16_t> >
 237 class ustring16
 238 {
 239 public:
 240
 241         ///------------------///
 242         /// iterator classes ///
 243         ///------------------///
 244
 245         //! Access an element in a unicode string, allowing one to change it.
 246         class _ustring16_iterator_access
 247         {
 248                 public:
 249                         _ustring16_iterator_access(const ustring16<TAlloc>* s, u32 p) : ref(s), pos(p) {}
 250
 251                         //! Allow the class to be interpreted as a single UTF-32 character.
 252                         operator uchar32_t() const
 253                         {
 254                                 return _get();
 255                         }
 256
 257                         //! Allow one to change the character in the unicode string.
 258                         //! \param c The new character to use.
 259                         //! \return Myself.
 260                         _ustring16_iterator_access& operator=(const uchar32_t c)
 261                         {
 262                                 _set(c);
 263                                 return *this;
 264                         }
 265
 266                         //! Increments the value by 1.
 267                         //! \return Myself.
 268                         _ustring16_iterator_access& operator++()
 269                         {
 270                                 _set(_get() + 1);
 271                                 return *this;
 272                         }
 273
 274                         //! Increments the value by 1, returning the old value.
 275                         //! \return A unicode character.
 276                         uchar32_t operator++(int)
 277                         {
 278                                 uchar32_t old = _get();
 279                                 _set(old + 1);
 280                                 return old;
 281                         }
 282
 283                         //! Decrements the value by 1.
 284                         //! \return Myself.
 285                         _ustring16_iterator_access& operator--()
 286                         {
 287                                 _set(_get() - 1);
 288                                 return *this;
 289                         }
 290
 291                         //! Decrements the value by 1, returning the old value.
 292                         //! \return A unicode character.
 293                         uchar32_t operator--(int)
 294                         {
 295                                 uchar32_t old = _get();
 296                                 _set(old - 1);
 297                                 return old;
 298                         }
 299
 300                         //! Adds to the value by a specified amount.
 301                         //! \param val The amount to add to this character.
 302                         //! \return Myself.
 303                         _ustring16_iterator_access& operator+=(int val)
 304                         {
 305                                 _set(_get() + val);
 306                                 return *this;
 307                         }
 308
 309                         //! Subtracts from the value by a specified amount.
 310                         //! \param val The amount to subtract from this character.
 311                         //! \return Myself.
 312                         _ustring16_iterator_access& operator-=(int val)
 313                         {
 314                                 _set(_get() - val);
 315                                 return *this;
 316                         }
 317
 318                         //! Multiples the value by a specified amount.
 319                         //! \param val The amount to multiply this character by.
 320                         //! \return Myself.
 321                         _ustring16_iterator_access& operator*=(int val)
 322                         {
 323                                 _set(_get() * val);
 324                                 return *this;
 325                         }
 326
 327                         //! Divides the value by a specified amount.
 328                         //! \param val The amount to divide this character by.
 329                         //! \return Myself.
 330                         _ustring16_iterator_access& operator/=(int val)
 331                         {
 332                                 _set(_get() / val);
 333                                 return *this;
 334                         }
 335
 336                         //! Modulos the value by a specified amount.
 337                         //! \param val The amount to modulo this character by.
 338                         //! \return Myself.
 339                         _ustring16_iterator_access& operator%=(int val)
 340                         {
 341                                 _set(_get() % val);
 342                                 return *this;
 343                         }
 344
 345                         //! Adds to the value by a specified amount.
 346                         //! \param val The amount to add to this character.
 347                         //! \return A unicode character.
 348                         uchar32_t operator+(int val) const
 349                         {
 350                                 return _get() + val;
 351                         }
 352
 353                         //! Subtracts from the value by a specified amount.
 354                         //! \param val The amount to subtract from this character.
 355                         //! \return A unicode character.
 356                         uchar32_t operator-(int val) const
 357                         {
 358                                 return _get() - val;
 359                         }
 360
 361                         //! Multiplies the value by a specified amount.
 362                         //! \param val The amount to multiply this character by.
 363                         //! \return A unicode character.
 364                         uchar32_t operator*(int val) const
 365                         {
 366                                 return _get() * val;
 367                         }
 368
 369                         //! Divides the value by a specified amount.
 370                         //! \param val The amount to divide this character by.
 371                         //! \return A unicode character.
 372                         uchar32_t operator/(int val) const
 373                         {
 374                                 return _get() / val;
 375                         }
 376
 377                         //! Modulos the value by a specified amount.
 378                         //! \param val The amount to modulo this character by.
 379                         //! \return A unicode character.
 380                         uchar32_t operator%(int val) const
 381                         {
 382                                 return _get() % val;
 383                         }
 384
 385                 private:
 386                         //! Gets a uchar32_t from our current position.
 387                         uchar32_t _get() const
 388                         {
 389                                 const uchar16_t* a = ref->c_str();
 390                                 if (!UTF16_IS_SURROGATE(a[pos]))
 391                                         return static_cast<uchar32_t>(a[pos]);
 392                                 else
 393                                 {
 394                                         if (pos + 1 >= ref->size_raw())
 395                                                 return 0;
 396
 397                                         return unicode::toUTF32(a[pos], a[pos + 1]);
 398                                 }
 399                         }
 400
 401                         //! Sets a uchar32_t at our current position.
 402                         void _set(uchar32_t c)
 403                         {
 404                                 ustring16<TAlloc>* ref2 = const_cast<ustring16<TAlloc>*>(ref);
 405                                 const uchar16_t* a = ref2->c_str();
 406                                 if (c > 0xFFFF)
 407                                 {
 408                                         // c will be multibyte, so split it up into the high and low surrogate pairs.
 409                                         uchar16_t x = static_cast<uchar16_t>(c);
 410                                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
 411                                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
 412
 413                                         // If the previous position was a surrogate pair, just replace them.  Else, insert the low pair.
 414                                         if (UTF16_IS_SURROGATE_HI(a[pos]) && pos + 1 != ref2->size_raw())
 415                                                 ref2->replace_raw(vl, static_cast<u32>(pos) + 1);
 416                                         else ref2->insert_raw(vl, static_cast<u32>(pos) + 1);
 417
 418                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 419                                 }
 420                                 else
 421                                 {
 422                                         // c will be a single byte.
 423                                         uchar16_t vh = static_cast<uchar16_t>(c);
 424
 425                                         // If the previous position was a surrogate pair, remove the extra byte.
 426                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 427                                                 ref2->erase_raw(static_cast<u32>(pos) + 1);
 428
 429                                         ref2->replace_raw(vh, static_cast<u32>(pos));
 430                                 }
 431                         }
 432
 433                         const ustring16<TAlloc>* ref;
 434                         u32 pos;
 435         };
 436         typedef typename ustring16<TAlloc>::_ustring16_iterator_access access;
 437
 438
 439         //! Iterator to iterate through a UTF-16 string.
 440 #ifndef USTRING_NO_STL
 441         class _ustring16_const_iterator : public std::iterator<
 442                 std::bidirectional_iterator_tag,        // iterator_category
 443                 access,                                                         // value_type
 444                 ptrdiff_t,                                                      // difference_type
 445                 const access,                                           // pointer
 446                 const access                                            // reference
 447         >
 448 #else
 449         class _ustring16_const_iterator
 450 #endif
 451         {
 452                 public:
 453                         typedef _ustring16_const_iterator _Iter;
 454                         typedef std::iterator<std::bidirectional_iterator_tag, access, ptrdiff_t, const access, const access> _Base;
 455                         typedef const access const_pointer;
 456                         typedef const access const_reference;
 457
 458 #ifndef USTRING_NO_STL
 459                         typedef typename _Base::value_type value_type;
 460                         typedef typename _Base::difference_type difference_type;
 461                         typedef typename _Base::difference_type distance_type;
 462                         typedef typename _Base::pointer pointer;
 463                         typedef const_reference reference;
 464 #else
 465                         typedef access value_type;
 466                         typedef u32 difference_type;
 467                         typedef u32 distance_type;
 468                         typedef const_pointer pointer;
 469                         typedef const_reference reference;
 470 #endif
 471
 472                         //! Constructors.
 473                         _ustring16_const_iterator(const _Iter& i) : ref(i.ref), pos(i.pos) {}
 474                         _ustring16_const_iterator(const ustring16<TAlloc>& s) : ref(&s), pos(0) {}
 475                         _ustring16_const_iterator(const ustring16<TAlloc>& s, const u32 p) : ref(&s), pos(0)
 476                         {
 477                                 if (ref->size_raw() == 0 || p == 0)
 478                                         return;
 479
 480                                 // Go to the appropriate position.
 481                                 u32 i = p;
 482                                 u32 sr = ref->size_raw();
 483                                 const uchar16_t* a = ref->c_str();
 484                                 while (i != 0 && pos < sr)
 485                                 {
 486                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 487                                                 pos += 2;
 488                                         else ++pos;
 489                                         --i;
 490                                 }
 491                         }
 492
 493                         //! Test for equalness.
 494                         bool operator==(const _Iter& iter) const
 495                         {
 496                                 if (ref == iter.ref && pos == iter.pos)
 497                                         return true;
 498                                 return false;
 499                         }
 500
 501                         //! Test for unequalness.
 502                         bool operator!=(const _Iter& iter) const
 503                         {
 504                                 if (ref != iter.ref || pos != iter.pos)
 505                                         return true;
 506                                 return false;
 507                         }
 508
 509                         //! Switch to the next full character in the string.
 510                         _Iter& operator++()
 511                         {       // ++iterator
 512                                 if (pos == ref->size_raw()) return *this;
 513                                 const uchar16_t* a = ref->c_str();
 514                                 if (UTF16_IS_SURROGATE_HI(a[pos]))
 515                                         pos += 2;                       // TODO: check for valid low surrogate?
 516                                 else ++pos;
 517                                 if (pos > ref->size_raw()) pos = ref->size_raw();
 518                                 return *this;
 519                         }
 520
 521                         //! Switch to the next full character in the string, returning the previous position.
 522                         _Iter operator++(int)
 523                         {       // iterator++
 524                                 _Iter _tmp(*this);
 525                                 ++*this;
 526                                 return _tmp;
 527                         }
 528
 529                         //! Switch to the previous full character in the string.
 530                         _Iter& operator--()
 531                         {       // --iterator
 532                                 if (pos == 0) return *this;
 533                                 const uchar16_t* a = ref->c_str();
 534                                 --pos;
 535                                 if (UTF16_IS_SURROGATE_LO(a[pos]) && pos != 0)  // low surrogate, go back one more.
 536                                         --pos;
 537                                 return *this;
 538                         }
 539
 540                         //! Switch to the previous full character in the string, returning the previous position.
 541                         _Iter operator--(int)
 542                         {       // iterator--
 543                                 _Iter _tmp(*this);
 544                                 --*this;
 545                                 return _tmp;
 546                         }
 547
 548                         //! Advance a specified number of full characters in the string.
 549                         //! \return Myself.
 550                         _Iter& operator+=(const difference_type v)
 551                         {
 552                                 if (v == 0) return *this;
 553                                 if (v < 0) return operator-=(v * -1);
 554
 555                                 if (pos >= ref->size_raw())
 556                                         return *this;
 557
 558                                 // Go to the appropriate position.
 559                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 560                                 u32 i = (u32)v;
 561                                 u32 sr = ref->size_raw();
 562                                 const uchar16_t* a = ref->c_str();
 563                                 while (i != 0 && pos < sr)
 564                                 {
 565                                         if (UTF16_IS_SURROGATE_HI(a[pos]))
 566                                                 pos += 2;
 567                                         else ++pos;
 568                                         --i;
 569                                 }
 570                                 if (pos > sr)
 571                                         pos = sr;
 572
 573                                 return *this;
 574                         }
 575
 576                         //! Go back a specified number of full characters in the string.
 577                         //! \return Myself.
 578                         _Iter& operator-=(const difference_type v)
 579                         {
 580                                 if (v == 0) return *this;
 581                                 if (v > 0) return operator+=(v * -1);
 582
 583                                 if (pos == 0)
 584                                         return *this;
 585
 586                                 // Go to the appropriate position.
 587                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
 588                                 u32 i = (u32)v;
 589                                 const uchar16_t* a = ref->c_str();
 590                                 while (i != 0 && pos != 0)
 591                                 {
 592                                         --pos;
 593                                         if (UTF16_IS_SURROGATE_LO(a[pos]) != 0 && pos != 0)
 594                                                 --pos;
 595                                         --i;
 596                                 }
 597
 598                                 return *this;
 599                         }
 600
 601                         //! Return a new iterator that is a variable number of full characters forward from the current position.
 602                         _Iter operator+(const difference_type v) const
 603                         {
 604                                 _Iter ret(*this);
 605                                 ret += v;
 606                                 return ret;
 607                         }
 608
 609                         //! Return a new iterator that is a variable number of full characters backward from the current position.
 610                         _Iter operator-(const difference_type v) const
 611                         {
 612                                 _Iter ret(*this);
 613                                 ret -= v;
 614                                 return ret;
 615                         }
 616
 617                         //! Returns the distance between two iterators.
 618                         difference_type operator-(const _Iter& iter) const
 619                         {
 620                                 // Make sure we reference the same object!
 621                                 if (ref != iter.ref)
 622                                         return difference_type();
 623
 624                                 _Iter i = iter;
 625                                 difference_type ret;
 626
 627                                 // Walk up.
 628                                 if (pos > i.pos)
 629                                 {
 630                                         while (pos > i.pos)
 631                                         {
 632                                                 ++i;
 633                                                 ++ret;
 634                                         }
 635                                         return ret;
 636                                 }
 637
 638                                 // Walk down.
 639                                 while (pos < i.pos)
 640                                 {
 641                                         --i;
 642                                         --ret;
 643                                 }
 644                                 return ret;
 645                         }
 646
 647                         //! Accesses the full character at the iterator's position.
 648                         const_reference operator*() const
 649                         {
 650                                 if (pos >= ref->size_raw())
 651                                 {
 652                                         const uchar16_t* a = ref->c_str();
 653                                         u32 p = ref->size_raw();
 654                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 655                                                 --p;
 656                                         reference ret(ref, p);
 657                                         return ret;
 658                                 }
 659                                 const_reference ret(ref, pos);
 660                                 return ret;
 661                         }
 662
 663                         //! Accesses the full character at the iterator's position.
 664                         reference operator*()
 665                         {
 666                                 if (pos >= ref->size_raw())
 667                                 {
 668                                         const uchar16_t* a = ref->c_str();
 669                                         u32 p = ref->size_raw();
 670                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 671                                                 --p;
 672                                         reference ret(ref, p);
 673                                         return ret;
 674                                 }
 675                                 reference ret(ref, pos);
 676                                 return ret;
 677                         }
 678
 679                         //! Accesses the full character at the iterator's position.
 680                         const_pointer operator->() const
 681                         {
 682                                 return operator*();
 683                         }
 684
 685                         //! Accesses the full character at the iterator's position.
 686                         pointer operator->()
 687                         {
 688                                 return operator*();
 689                         }
 690
 691                         //! Is the iterator at the start of the string?
 692                         bool atStart() const
 693                         {
 694                                 return pos == 0;
 695                         }
 696
 697                         //! Is the iterator at the end of the string?
 698                         bool atEnd() const
 699                         {
 700                                 const uchar16_t* a = ref->c_str();
 701                                 if (UTF16_IS_SURROGATE(a[pos]))
 702                                         return (pos + 1) >= ref->size_raw();
 703                                 else return pos >= ref->size_raw();
 704                         }
 705
 706                         //! Moves the iterator to the start of the string.
 707                         void toStart()
 708                         {
 709                                 pos = 0;
 710                         }
 711
 712                         //! Moves the iterator to the end of the string.
 713                         void toEnd()
 714                         {
 715                                 pos = ref->size_raw();
 716                         }
 717
 718                         //! Returns the iterator's position.
 719                         //! \return The iterator's position.
 720                         u32 getPos() const
 721                         {
 722                                 return pos;
 723                         }
 724
 725                 protected:
 726                         const ustring16<TAlloc>* ref;
 727                         u32 pos;
 728         };
 729
 730         //! Iterator to iterate through a UTF-16 string.
 731         class _ustring16_iterator : public _ustring16_const_iterator
 732         {
 733                 public:
 734                         typedef _ustring16_iterator _Iter;
 735                         typedef _ustring16_const_iterator _Base;
 736                         typedef typename _Base::const_pointer const_pointer;
 737                         typedef typename _Base::const_reference const_reference;
 738
 739
 740                         typedef typename _Base::value_type value_type;
 741                         typedef typename _Base::difference_type difference_type;
 742                         typedef typename _Base::distance_type distance_type;
 743                         typedef access pointer;
 744                         typedef access reference;
 745
 746                         using _Base::pos;
 747                         using _Base::ref;
 748
 749                         //! Constructors.
 750                         _ustring16_iterator(const _Iter& i) : _ustring16_const_iterator(i) {}
 751                         _ustring16_iterator(const ustring16<TAlloc>& s) : _ustring16_const_iterator(s) {}
 752                         _ustring16_iterator(const ustring16<TAlloc>& s, const u32 p) : _ustring16_const_iterator(s, p) {}
 753
 754                         //! Accesses the full character at the iterator's position.
 755                         reference operator*() const
 756                         {
 757                                 if (pos >= ref->size_raw())
 758                                 {
 759                                         const uchar16_t* a = ref->c_str();
 760                                         u32 p = ref->size_raw();
 761                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 762                                                 --p;
 763                                         reference ret(ref, p);
 764                                         return ret;
 765                                 }
 766                                 reference ret(ref, pos);
 767                                 return ret;
 768                         }
 769
 770                         //! Accesses the full character at the iterator's position.
 771                         reference operator*()
 772                         {
 773                                 if (pos >= ref->size_raw())
 774                                 {
 775                                         const uchar16_t* a = ref->c_str();
 776                                         u32 p = ref->size_raw();
 777                                         if (UTF16_IS_SURROGATE_LO(a[p]))
 778                                                 --p;
 779                                         reference ret(ref, p);
 780                                         return ret;
 781                                 }
 782                                 reference ret(ref, pos);
 783                                 return ret;
 784                         }
 785
 786                         //! Accesses the full character at the iterator's position.
 787                         pointer operator->() const
 788                         {
 789                                 return operator*();
 790                         }
 791
 792                         //! Accesses the full character at the iterator's position.
 793                         pointer operator->()
 794                         {
 795                                 return operator*();
 796                         }
 797         };
 798
 799         typedef typename ustring16<TAlloc>::_ustring16_iterator iterator;
 800         typedef typename ustring16<TAlloc>::_ustring16_const_iterator const_iterator;
 801
 802         ///----------------------///
 803         /// end iterator classes ///
 804         ///----------------------///
 805
 806         //! Default constructor
 807         ustring16()
 808         : array(0), allocated(1), used(0)
 809         {
 810 #if __BYTE_ORDER == __BIG_ENDIAN
 811                 encoding = unicode::EUTFE_UTF16_BE;
 812 #else
 813                 encoding = unicode::EUTFE_UTF16_LE;
 814 #endif
 815                 array = allocator.allocate(1); // new u16[1];
 816                 array[0] = 0x0;
 817         }
 818
 819
 820         //! Constructor
 821         ustring16(const ustring16<TAlloc>& other)
 822         : array(0), allocated(0), used(0)
 823         {
 824 #if __BYTE_ORDER == __BIG_ENDIAN
 825                 encoding = unicode::EUTFE_UTF16_BE;
 826 #else
 827                 encoding = unicode::EUTFE_UTF16_LE;
 828 #endif
 829                 *this = other;
 830         }
 831
 832
 833         //! Constructor from other string types
 834         template <class B, class A>
 835         ustring16(const string<B, A>& other)
 836         : array(0), allocated(0), used(0)
 837         {
 838 #if __BYTE_ORDER == __BIG_ENDIAN
 839                 encoding = unicode::EUTFE_UTF16_BE;
 840 #else
 841                 encoding = unicode::EUTFE_UTF16_LE;
 842 #endif
 843                 *this = other;
 844         }
 845
 846
 847 #ifndef USTRING_NO_STL
 848         //! Constructor from std::string
 849         template <class B, class A, typename Alloc>
 850         ustring16(const std::basic_string<B, A, Alloc>& other)
 851         : array(0), allocated(0), used(0)
 852         {
 853 #if __BYTE_ORDER == __BIG_ENDIAN
 854                 encoding = unicode::EUTFE_UTF16_BE;
 855 #else
 856                 encoding = unicode::EUTFE_UTF16_LE;
 857 #endif
 858                 *this = other.c_str();
 859         }
 860
 861
 862         //! Constructor from iterator.
 863         template <typename Itr>
 864         ustring16(Itr first, Itr last)
 865         : array(0), allocated(0), used(0)
 866         {
 867 #if __BYTE_ORDER == __BIG_ENDIAN
 868                 encoding = unicode::EUTFE_UTF16_BE;
 869 #else
 870                 encoding = unicode::EUTFE_UTF16_LE;
 871 #endif
 872                 reserve(std::distance(first, last));
 873                 array[used] = 0;
 874
 875                 for (; first != last; ++first)
 876                         append((uchar32_t)*first);
 877         }
 878 #endif
 879
 880
 881 #ifndef USTRING_CPP0X_NEWLITERALS
 882         //! Constructor for copying a character string from a pointer.
 883         ustring16(const char* const c)
 884         : array(0), allocated(0), used(0)
 885         {
 886 #if __BYTE_ORDER == __BIG_ENDIAN
 887                 encoding = unicode::EUTFE_UTF16_BE;
 888 #else
 889                 encoding = unicode::EUTFE_UTF16_LE;
 890 #endif
 891
 892                 loadDataStream(c, strlen(c));
 893                 //append((uchar8_t*)c);
 894         }
 895
 896
 897         //! Constructor for copying a character string from a pointer with a given length.
 898         ustring16(const char* const c, u32 length)
 899         : array(0), allocated(0), used(0)
 900         {
 901 #if __BYTE_ORDER == __BIG_ENDIAN
 902                 encoding = unicode::EUTFE_UTF16_BE;
 903 #else
 904                 encoding = unicode::EUTFE_UTF16_LE;
 905 #endif
 906
 907                 loadDataStream(c, length);
 908         }
 909 #endif
 910
 911
 912         //! Constructor for copying a UTF-8 string from a pointer.
 913         ustring16(const uchar8_t* const c)
 914         : array(0), allocated(0), used(0)
 915         {
 916 #if __BYTE_ORDER == __BIG_ENDIAN
 917                 encoding = unicode::EUTFE_UTF16_BE;
 918 #else
 919                 encoding = unicode::EUTFE_UTF16_LE;
 920 #endif
 921
 922                 append(c);
 923         }
 924
 925
 926         //! Constructor for copying a UTF-8 string from a single char.
 927         ustring16(const char c)
 928         : array(0), allocated(0), used(0)
 929         {
 930 #if __BYTE_ORDER == __BIG_ENDIAN
 931                 encoding = unicode::EUTFE_UTF16_BE;
 932 #else
 933                 encoding = unicode::EUTFE_UTF16_LE;
 934 #endif
 935
 936                 append((uchar32_t)c);
 937         }
 938
 939
 940         //! Constructor for copying a UTF-8 string from a pointer with a given length.
 941         ustring16(const uchar8_t* const c, u32 length)
 942         : array(0), allocated(0), used(0)
 943         {
 944 #if __BYTE_ORDER == __BIG_ENDIAN
 945                 encoding = unicode::EUTFE_UTF16_BE;
 946 #else
 947                 encoding = unicode::EUTFE_UTF16_LE;
 948 #endif
 949
 950                 append(c, length);
 951         }
 952
 953
 954         //! Constructor for copying a UTF-16 string from a pointer.
 955         ustring16(const uchar16_t* const c)
 956         : array(0), allocated(0), used(0)
 957         {
 958 #if __BYTE_ORDER == __BIG_ENDIAN
 959                 encoding = unicode::EUTFE_UTF16_BE;
 960 #else
 961                 encoding = unicode::EUTFE_UTF16_LE;
 962 #endif
 963
 964                 append(c);
 965         }
 966
 967
 968         //! Constructor for copying a UTF-16 string from a pointer with a given length
 969         ustring16(const uchar16_t* const c, u32 length)
 970         : array(0), allocated(0), used(0)
 971         {
 972 #if __BYTE_ORDER == __BIG_ENDIAN
 973                 encoding = unicode::EUTFE_UTF16_BE;
 974 #else
 975                 encoding = unicode::EUTFE_UTF16_LE;
 976 #endif
 977
 978                 append(c, length);
 979         }
 980
 981
 982         //! Constructor for copying a UTF-32 string from a pointer.
 983         ustring16(const uchar32_t* const c)
 984         : array(0), allocated(0), used(0)
 985         {
 986 #if __BYTE_ORDER == __BIG_ENDIAN
 987                 encoding = unicode::EUTFE_UTF16_BE;
 988 #else
 989                 encoding = unicode::EUTFE_UTF16_LE;
 990 #endif
 991
 992                 append(c);
 993         }
 994
 995
 996         //! Constructor for copying a UTF-32 from a pointer with a given length.
 997         ustring16(const uchar32_t* const c, u32 length)
 998         : array(0), allocated(0), used(0)
 999         {
1000 #if __BYTE_ORDER == __BIG_ENDIAN
1001                 encoding = unicode::EUTFE_UTF16_BE;
1002 #else
1003                 encoding = unicode::EUTFE_UTF16_LE;
1004 #endif
1005
1006                 append(c, length);
1007         }
1008
1009
1010         //! Constructor for copying a wchar_t string from a pointer.
1011         ustring16(const wchar_t* const c)
1012         : array(0), allocated(0), used(0)
1013         {
1014 #if __BYTE_ORDER == __BIG_ENDIAN
1015                 encoding = unicode::EUTFE_UTF16_BE;
1016 #else
1017                 encoding = unicode::EUTFE_UTF16_LE;
1018 #endif
1019
1020                 if (sizeof(wchar_t) == 4)
1021                         append(reinterpret_cast<const uchar32_t* const>(c));
1022                 else if (sizeof(wchar_t) == 2)
1023                         append(reinterpret_cast<const uchar16_t* const>(c));
1024                 else if (sizeof(wchar_t) == 1)
1025                         append(reinterpret_cast<const uchar8_t* const>(c));
1026         }
1027
1028
1029         //! Constructor for copying a wchar_t string from a pointer with a given length.
1030         ustring16(const wchar_t* const c, u32 length)
1031         : array(0), allocated(0), used(0)
1032         {
1033 #if __BYTE_ORDER == __BIG_ENDIAN
1034                 encoding = unicode::EUTFE_UTF16_BE;
1035 #else
1036                 encoding = unicode::EUTFE_UTF16_LE;
1037 #endif
1038
1039                 if (sizeof(wchar_t) == 4)
1040                         append(reinterpret_cast<const uchar32_t* const>(c), length);
1041                 else if (sizeof(wchar_t) == 2)
1042                         append(reinterpret_cast<const uchar16_t* const>(c), length);
1043                 else if (sizeof(wchar_t) == 1)
1044                         append(reinterpret_cast<const uchar8_t* const>(c), length);
1045         }
1046
1047
1048 #ifdef USTRING_CPP0X
1049         //! Constructor for moving a ustring16
1050         ustring16(ustring16<TAlloc>&& other)
1051         : array(other.array), encoding(other.encoding), allocated(other.allocated), used(other.used)
1052         {
1053                 //std::cout << "MOVE constructor" << std::endl;
1054                 other.array = 0;
1055                 other.allocated = 0;
1056                 other.used = 0;
1057         }
1058 #endif
1059
1060
1061         //! Destructor
1062         ~ustring16()
1063         {
1064                 allocator.deallocate(array); // delete [] array;
1065         }
1066
1067
1068         //! Assignment operator
1069         ustring16& operator=(const ustring16<TAlloc>& other)
1070         {
1071                 if (this == &other)
1072                         return *this;
1073
1074                 used = other.size_raw();
1075                 if (used >= allocated)
1076                 {
1077                         allocator.deallocate(array); // delete [] array;
1078                         allocated = used + 1;
1079                         array = allocator.allocate(used + 1); //new u16[used];
1080                 }
1081
1082                 const uchar16_t* p = other.c_str();
1083                 for (u32 i=0; i<=used; ++i, ++p)
1084                         array[i] = *p;
1085
1086                 array[used] = 0;
1087
1088                 // Validate our new UTF-16 string.
1089                 validate();
1090
1091                 return *this;
1092         }
1093
1094
1095 #ifdef USTRING_CPP0X
1096         //! Move assignment operator
1097         ustring16& operator=(ustring16<TAlloc>&& other)
1098         {
1099                 if (this != &other)
1100                 {
1101                         //std::cout << "MOVE operator=" << std::endl;
1102                         allocator.deallocate(array);
1103
1104                         array = other.array;
1105                         allocated = other.allocated;
1106                         encoding = other.encoding;
1107                         used = other.used;
1108                         other.array = 0;
1109                         other.used = 0;
1110                 }
1111                 return *this;
1112         }
1113 #endif
1114
1115
1116         //! Assignment operator for other string types
1117         template <class B, class A>
1118         ustring16<TAlloc>& operator=(const string<B, A>& other)
1119         {
1120                 *this = other.c_str();
1121                 return *this;
1122         }
1123
1124
1125         //! Assignment operator for UTF-8 strings
1126         ustring16<TAlloc>& operator=(const uchar8_t* const c)
1127         {
1128                 if (!array)
1129                 {
1130                         array = allocator.allocate(1); //new u16[1];
1131                         allocated = 1;
1132                 }
1133                 used = 0;
1134                 array[used] = 0x0;
1135                 if (!c) return *this;
1136
1137                 //! Append our string now.
1138                 append(c);
1139                 return *this;
1140         }
1141
1142
1143         //! Assignment operator for UTF-16 strings
1144         ustring16<TAlloc>& operator=(const uchar16_t* const c)
1145         {
1146                 if (!array)
1147                 {
1148                         array = allocator.allocate(1); //new u16[1];
1149                         allocated = 1;
1150                 }
1151                 used = 0;
1152                 array[used] = 0x0;
1153                 if (!c) return *this;
1154
1155                 //! Append our string now.
1156                 append(c);
1157                 return *this;
1158         }
1159
1160
1161         //! Assignment operator for UTF-32 strings
1162         ustring16<TAlloc>& operator=(const uchar32_t* const c)
1163         {
1164                 if (!array)
1165                 {
1166                         array = allocator.allocate(1); //new u16[1];
1167                         allocated = 1;
1168                 }
1169                 used = 0;
1170                 array[used] = 0x0;
1171                 if (!c) return *this;
1172
1173                 //! Append our string now.
1174                 append(c);
1175                 return *this;
1176         }
1177
1178
1179         //! Assignment operator for wchar_t strings.
1180         /** Note that this assumes that a correct unicode string is stored in the wchar_t string.
1181                 Since wchar_t changes depending on its platform, it could either be a UTF-8, -16, or -32 string.
1182                 This function assumes you are storing the correct unicode encoding inside the wchar_t string. **/
1183         ustring16<TAlloc>& operator=(const wchar_t* const c)
1184         {
1185                 if (sizeof(wchar_t) == 4)
1186                         *this = reinterpret_cast<const uchar32_t* const>(c);
1187                 else if (sizeof(wchar_t) == 2)
1188                         *this = reinterpret_cast<const uchar16_t* const>(c);
1189                 else if (sizeof(wchar_t) == 1)
1190                         *this = reinterpret_cast<const uchar8_t* const>(c);
1191
1192                 return *this;
1193         }
1194
1195
1196         //! Assignment operator for other strings.
1197         /** Note that this assumes that a correct unicode string is stored in the string. **/
1198         template <class B>
1199         ustring16<TAlloc>& operator=(const B* const c)
1200         {
1201                 if (sizeof(B) == 4)
1202                         *this = reinterpret_cast<const uchar32_t* const>(c);
1203                 else if (sizeof(B) == 2)
1204                         *this = reinterpret_cast<const uchar16_t* const>(c);
1205                 else if (sizeof(B) == 1)
1206                         *this = reinterpret_cast<const uchar8_t* const>(c);
1207
1208                 return *this;
1209         }
1210
1211
1212         //! Direct access operator
1213         access operator [](const u32 index)
1214         {
1215                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1216                 iterator iter(*this, index);
1217                 return iter.operator*();
1218         }
1219
1220
1221         //! Direct access operator
1222         const access operator [](const u32 index) const
1223         {
1224                 _IRR_DEBUG_BREAK_IF(index>=size()) // bad index
1225                 const_iterator iter(*this, index);
1226                 return iter.operator*();
1227         }
1228
1229
1230         //! Equality operator
1231         bool operator ==(const uchar16_t* const str) const
1232         {
1233                 if (!str)
1234                         return false;
1235
1236                 u32 i;
1237                 for(i=0; array[i] && str[i]; ++i)
1238                         if (array[i] != str[i])
1239                                 return false;
1240
1241                 return !array[i] && !str[i];
1242         }
1243
1244
1245         //! Equality operator
1246         bool operator ==(const ustring16<TAlloc>& other) const
1247         {
1248                 for(u32 i=0; array[i] && other.array[i]; ++i)
1249                         if (array[i] != other.array[i])
1250                                 return false;
1251
1252                 return used == other.used;
1253         }
1254
1255
1256         //! Is smaller comparator
1257         bool operator <(const ustring16<TAlloc>& other) const
1258         {
1259                 for(u32 i=0; array[i] && other.array[i]; ++i)
1260                 {
1261                         s32 diff = array[i] - other.array[i];
1262                         if ( diff )
1263                                 return diff < 0;
1264                 }
1265
1266                 return used < other.used;
1267         }
1268
1269
1270         //! Inequality operator
1271         bool operator !=(const uchar16_t* const str) const
1272         {
1273                 return !(*this == str);
1274         }
1275
1276
1277         //! Inequality operator
1278         bool operator !=(const ustring16<TAlloc>& other) const
1279         {
1280                 return !(*this == other);
1281         }
1282
1283
1284         //! Returns the length of a ustring16 in full characters.
1285         //! \return Length of a ustring16 in full characters.
1286         u32 size() const
1287         {
1288                 const_iterator i(*this, 0);
1289                 u32 pos = 0;
1290                 while (!i.atEnd())
1291                 {
1292                         ++i;
1293                         ++pos;
1294                 }
1295                 return pos;
1296         }
1297
1298
1299         //! Informs if the ustring is empty or not.
1300         //! \return True if the ustring is empty, false if not.
1301         bool empty() const
1302         {
1303                 return (size_raw() == 0);
1304         }
1305
1306
1307         //! Returns a pointer to the raw UTF-16 string data.
1308         //! \return pointer to C-style NUL terminated array of UTF-16 code points.
1309         const uchar16_t* c_str() const
1310         {
1311                 return array;
1312         }
1313
1314
1315         //! Compares the first n characters of this string with another.
1316         //! \param other Other string to compare to.
1317         //! \param n Number of characters to compare.
1318         //! \return True if the n first characters of both strings are equal.
1319         bool equalsn(const ustring16<TAlloc>& other, u32 n) const
1320         {
1321                 u32 i;
1322                 const uchar16_t* oa = other.c_str();
1323                 for(i=0; array[i] && oa[i] && i < n; ++i)
1324                         if (array[i] != oa[i])
1325                                 return false;
1326
1327                 // if one (or both) of the strings was smaller then they
1328                 // are only equal if they have the same length
1329                 return (i == n) || (used == other.used);
1330         }
1331
1332
1333         //! Compares the first n characters of this string with another.
1334         //! \param str Other string to compare to.
1335         //! \param n Number of characters to compare.
1336         //! \return True if the n first characters of both strings are equal.
1337         bool equalsn(const uchar16_t* const str, u32 n) const
1338         {
1339                 if (!str)
1340                         return false;
1341                 u32 i;
1342                 for(i=0; array[i] && str[i] && i < n; ++i)
1343                         if (array[i] != str[i])
1344                                 return false;
1345
1346                 // if one (or both) of the strings was smaller then they
1347                 // are only equal if they have the same length
1348                 return (i == n) || (array[i] == 0 && str[i] == 0);
1349         }
1350
1351
1352         //! Appends a character to this ustring16
1353         //! \param character The character to append.
1354         //! \return A reference to our current string.
1355         ustring16<TAlloc>& append(uchar32_t character)
1356         {
1357                 if (used + 2 >= allocated)
1358                         reallocate(used + 2);
1359
1360                 if (character > 0xFFFF)
1361                 {
1362                         used += 2;
1363
1364                         // character will be multibyte, so split it up into a surrogate pair.
1365                         uchar16_t x = static_cast<uchar16_t>(character);
1366                         uchar16_t vh = UTF16_HI_SURROGATE | ((((character >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1367                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1368                         array[used-2] = vh;
1369                         array[used-1] = vl;
1370                 }
1371                 else
1372                 {
1373                         ++used;
1374                         array[used-1] = character;
1375                 }
1376                 array[used] = 0;
1377
1378                 return *this;
1379         }
1380
1381
1382         //! Appends a UTF-8 string to this ustring16
1383         //! \param other The UTF-8 string to append.
1384         //! \param length The length of the string to append.
1385         //! \return A reference to our current string.
1386         ustring16<TAlloc>& append(const uchar8_t* const other, u32 length=0xffffffff)
1387         {
1388                 if (!other)
1389                         return *this;
1390
1391                 // Determine if the string is long enough for a BOM.
1392                 u32 len = 0;
1393                 const uchar8_t* p = other;
1394                 do
1395                 {
1396                         ++len;
1397                 } while (*p++ && len < unicode::BOM_ENCODE_UTF8_LEN);
1398
1399                 // Check for BOM.
1400                 unicode::EUTF_ENCODE c_bom = unicode::EUTFE_NONE;
1401                 if (len == unicode::BOM_ENCODE_UTF8_LEN)
1402                 {
1403                         if (memcmp(other, unicode::BOM_ENCODE_UTF8, unicode::BOM_ENCODE_UTF8_LEN) == 0)
1404                                 c_bom = unicode::EUTFE_UTF8;
1405                 }
1406
1407                 // If a BOM was found, don't include it in the string.
1408                 const uchar8_t* c2 = other;
1409                 if (c_bom != unicode::EUTFE_NONE)
1410                 {
1411                         c2 = other + unicode::BOM_UTF8_LEN;
1412                         length -= unicode::BOM_UTF8_LEN;
1413                 }
1414
1415                 // Calculate the size of the string to read in.
1416                 len = 0;
1417                 p = c2;
1418                 do
1419                 {
1420                         ++len;
1421                 } while(*p++ && len < length);
1422                 if (len > length)
1423                         len = length;
1424
1425                 // If we need to grow the array, do it now.
1426                 if (used + len >= allocated)
1427                         reallocate(used + (len * 2));
1428                 u32 start = used;
1429
1430                 // Convert UTF-8 to UTF-16.
1431                 u32 pos = start;
1432                 for (u32 l = 0; l<len;)
1433                 {
1434                         ++used;
1435                         if (((c2[l] >> 6) & 0x03) == 0x02)
1436                         {       // Invalid continuation byte.
1437                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1438                                 ++l;
1439                         }
1440                         else if (c2[l] == 0xC0 || c2[l] == 0xC1)
1441                         {       // Invalid byte - overlong encoding.
1442                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1443                                 ++l;
1444                         }
1445                         else if ((c2[l] & 0xF8) == 0xF0)
1446                         {       // 4 bytes UTF-8, 2 bytes UTF-16.
1447                                 // Check for a full string.
1448                                 if ((l + 3) >= len)
1449                                 {
1450                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1451                                         l += 3;
1452                                         break;
1453                                 }
1454
1455                                 // Validate.
1456                                 bool valid = true;
1457                                 u8 l2 = 0;
1458                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1459                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1460                                 if (valid && (((c2[l+3] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1461                                 if (!valid)
1462                                 {
1463                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1464                                         l += l2;
1465                                         continue;
1466                                 }
1467
1468                                 // Decode.
1469                                 uchar8_t b1 = ((c2[l] & 0x7) << 2) | ((c2[l+1] >> 4) & 0x3);
1470                                 uchar8_t b2 = ((c2[l+1] & 0xF) << 4) | ((c2[l+2] >> 2) & 0xF);
1471                                 uchar8_t b3 = ((c2[l+2] & 0x3) << 6) | (c2[l+3] & 0x3F);
1472                                 uchar32_t v = b3 | ((uchar32_t)b2 << 8) | ((uchar32_t)b1 << 16);
1473
1474                                 // Split v up into a surrogate pair.
1475                                 uchar16_t x = static_cast<uchar16_t>(v);
1476                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((v >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1477                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1478
1479                                 array[pos++] = vh;
1480                                 array[pos++] = vl;
1481                                 l += 4;
1482                                 ++used;         // Using two shorts this time, so increase used by 1.
1483                         }
1484                         else if ((c2[l] & 0xF0) == 0xE0)
1485                         {       // 3 bytes UTF-8, 1 byte UTF-16.
1486                                 // Check for a full string.
1487                                 if ((l + 2) >= len)
1488                                 {
1489                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1490                                         l += 2;
1491                                         break;
1492                                 }
1493
1494                                 // Validate.
1495                                 bool valid = true;
1496                                 u8 l2 = 0;
1497                                 if (valid && (((c2[l+1] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1498                                 if (valid && (((c2[l+2] >> 6) & 0x03) == 0x02)) ++l2; else valid = false;
1499                                 if (!valid)
1500                                 {
1501                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1502                                         l += l2;
1503                                         continue;
1504                                 }
1505
1506                                 // Decode.
1507                                 uchar8_t b1 = ((c2[l] & 0xF) << 4) | ((c2[l+1] >> 2) & 0xF);
1508                                 uchar8_t b2 = ((c2[l+1] & 0x3) << 6) | (c2[l+2] & 0x3F);
1509                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1510                                 array[pos++] = ch;
1511                                 l += 3;
1512                         }
1513                         else if ((c2[l] & 0xE0) == 0xC0)
1514                         {       // 2 bytes UTF-8, 1 byte UTF-16.
1515                                 // Check for a full string.
1516                                 if ((l + 1) >= len)
1517                                 {
1518                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1519                                         l += 1;
1520                                         break;
1521                                 }
1522
1523                                 // Validate.
1524                                 if (((c2[l+1] >> 6) & 0x03) != 0x02)
1525                                 {
1526                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1527                                         ++l;
1528                                         continue;
1529                                 }
1530
1531                                 // Decode.
1532                                 uchar8_t b1 = (c2[l] >> 2) & 0x7;
1533                                 uchar8_t b2 = ((c2[l] & 0x3) << 6) | (c2[l+1] & 0x3F);
1534                                 uchar16_t ch = b2 | ((uchar16_t)b1 << 8);
1535                                 array[pos++] = ch;
1536                                 l += 2;
1537                         }
1538                         else
1539                         {       // 1 byte UTF-8, 1 byte UTF-16.
1540                                 // Validate.
1541                                 if (c2[l] > 0x7F)
1542                                 {       // Values above 0xF4 are restricted and aren't used.  By now, anything above 0x7F is invalid.
1543                                         array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1544                                 }
1545                                 else array[pos++] = static_cast<uchar16_t>(c2[l]);
1546                                 ++l;
1547                         }
1548                 }
1549                 array[used] = 0;
1550
1551                 // Validate our new UTF-16 string.
1552                 validate();
1553
1554                 return *this;
1555         }
1556
1557
1558         //! Appends a UTF-16 string to this ustring16
1559         //! \param other The UTF-16 string to append.
1560         //! \param length The length of the string to append.
1561         //! \return A reference to our current string.
1562         ustring16<TAlloc>& append(const uchar16_t* const other, u32 length=0xffffffff)
1563         {
1564                 if (!other)
1565                         return *this;
1566
1567                 // Determine if the string is long enough for a BOM.
1568                 u32 len = 0;
1569                 const uchar16_t* p = other;
1570                 do
1571                 {
1572                         ++len;
1573                 } while (*p++ && len < unicode::BOM_ENCODE_UTF16_LEN);
1574
1575                 // Check for the BOM to determine the string's endianness.
1576                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1577                 if (memcmp(other, unicode::BOM_ENCODE_UTF16_LE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1578                         c_end = unicode::EUTFEE_LITTLE;
1579                 else if (memcmp(other, unicode::BOM_ENCODE_UTF16_BE, unicode::BOM_ENCODE_UTF16_LEN) == 0)
1580                         c_end = unicode::EUTFEE_BIG;
1581
1582                 // If a BOM was found, don't include it in the string.
1583                 const uchar16_t* c2 = other;
1584                 if (c_end != unicode::EUTFEE_NATIVE)
1585                 {
1586                         c2 = other + unicode::BOM_UTF16_LEN;
1587                         length -= unicode::BOM_UTF16_LEN;
1588                 }
1589
1590                 // Calculate the size of the string to read in.
1591                 len = 0;
1592                 p = c2;
1593                 do
1594                 {
1595                         ++len;
1596                 } while(*p++ && len < length);
1597                 if (len > length)
1598                         len = length;
1599
1600                 // If we need to grow the size of the array, do it now.
1601                 if (used + len >= allocated)
1602                         reallocate(used + (len * 2));
1603                 u32 start = used;
1604                 used += len;
1605
1606                 // Copy the string now.
1607                 unicode::EUTF_ENDIAN m_end = getEndianness();
1608                 for (u32 l = start; l < start + len; ++l)
1609                 {
1610                         array[l] = (uchar16_t)c2[l];
1611                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1612                                 array[l] = unicode::swapEndian16(array[l]);
1613                 }
1614
1615                 array[used] = 0;
1616
1617                 // Validate our new UTF-16 string.
1618                 validate();
1619                 return *this;
1620         }
1621
1622
1623         //! Appends a UTF-32 string to this ustring16
1624         //! \param other The UTF-32 string to append.
1625         //! \param length The length of the string to append.
1626         //! \return A reference to our current string.
1627         ustring16<TAlloc>& append(const uchar32_t* const other, u32 length=0xffffffff)
1628         {
1629                 if (!other)
1630                         return *this;
1631
1632                 // Check for the BOM to determine the string's endianness.
1633                 unicode::EUTF_ENDIAN c_end = unicode::EUTFEE_NATIVE;
1634                 if (memcmp(other, unicode::BOM_ENCODE_UTF32_LE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1635                         c_end = unicode::EUTFEE_LITTLE;
1636                 else if (memcmp(other, unicode::BOM_ENCODE_UTF32_BE, unicode::BOM_ENCODE_UTF32_LEN) == 0)
1637                         c_end = unicode::EUTFEE_BIG;
1638
1639                 // If a BOM was found, don't include it in the string.
1640                 const uchar32_t* c2 = other;
1641                 if (c_end != unicode::EUTFEE_NATIVE)
1642                 {
1643                         c2 = other + unicode::BOM_UTF32_LEN;
1644                         length -= unicode::BOM_UTF32_LEN;
1645                 }
1646
1647                 // Calculate the size of the string to read in.
1648                 u32 len = 0;
1649                 const uchar32_t* p = c2;
1650                 do
1651                 {
1652                         ++len;
1653                 } while(*p++ && len < length);
1654                 if (len > length)
1655                         len = length;
1656
1657                 // If we need to grow the size of the array, do it now.
1658                 // In case all of the UTF-32 string is split into surrogate pairs, do len * 2.
1659                 if (used + (len * 2) >= allocated)
1660                         reallocate(used + ((len * 2) * 2));
1661                 u32 start = used;
1662
1663                 // Convert UTF-32 to UTF-16.
1664                 unicode::EUTF_ENDIAN m_end = getEndianness();
1665                 u32 pos = start;
1666                 for (u32 l = 0; l<len; ++l)
1667                 {
1668                         ++used;
1669
1670                         uchar32_t ch = c2[l];
1671                         if (c_end != unicode::EUTFEE_NATIVE && c_end != m_end)
1672                                 ch = unicode::swapEndian32(ch);
1673
1674                         if (ch > 0xFFFF)
1675                         {
1676                                 // Split ch up into a surrogate pair as it is over 16 bits long.
1677                                 uchar16_t x = static_cast<uchar16_t>(ch);
1678                                 uchar16_t vh = UTF16_HI_SURROGATE | ((((ch >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
1679                                 uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
1680                                 array[pos++] = vh;
1681                                 array[pos++] = vl;
1682                                 ++used;         // Using two shorts, so increased used again.
1683                         }
1684                         else if (ch >= 0xD800 && ch <= 0xDFFF)
1685                         {
1686                                 // Between possible UTF-16 surrogates (invalid!)
1687                                 array[pos++] = unicode::UTF_REPLACEMENT_CHARACTER;
1688                         }
1689                         else array[pos++] = static_cast<uchar16_t>(ch);
1690                 }
1691                 array[used] = 0;
1692
1693                 // Validate our new UTF-16 string.
1694                 validate();
1695
1696                 return *this;
1697         }
1698
1699
1700         //! Appends a ustring16 to this ustring16
1701         //! \param other The string to append to this one.
1702         //! \return A reference to our current string.
1703         ustring16<TAlloc>& append(const ustring16<TAlloc>& other)
1704         {
1705                 const uchar16_t* oa = other.c_str();
1706
1707                 u32 len = other.size_raw();
1708
1709                 if (used + len >= allocated)
1710                         reallocate(used + len);
1711
1712                 for (u32 l=0; l<len; ++l)
1713                         array[used+l] = oa[l];
1714
1715                 used += len;
1716                 array[used] = 0;
1717
1718                 return *this;
1719         }
1720
1721
1722         //! Appends a certain amount of characters of a ustring16 to this ustring16.
1723         //! \param other The string to append to this one.
1724         //! \param length How many characters of the other string to add to this one.
1725         //! \return A reference to our current string.
1726         ustring16<TAlloc>& append(const ustring16<TAlloc>& other, u32 length)
1727         {
1728                 if (other.size() == 0)
1729                         return *this;
1730
1731                 if (other.size() < length)
1732                 {
1733                         append(other);
1734                         return *this;
1735                 }
1736
1737                 if (used + length * 2 >= allocated)
1738                         reallocate(used + length * 2);
1739
1740                 const_iterator iter(other, 0);
1741                 u32 l = length;
1742                 while (!iter.atEnd() && l)
1743                 {
1744                         uchar32_t c = *iter;
1745                         append(c);
1746                         ++iter;
1747                         --l;
1748                 }
1749
1750                 return *this;
1751         }
1752
1753
1754         //! Reserves some memory.
1755         //! \param count The amount of characters to reserve.
1756         void reserve(u32 count)
1757         {
1758                 if (count < allocated)
1759                         return;
1760
1761                 reallocate(count);
1762         }
1763
1764
1765         //! Finds first occurrence of character.
1766         //! \param c The character to search for.
1767         //! \return Position where the character has been found, or -1 if not found.
1768         s32 findFirst(uchar32_t c) const
1769         {
1770                 const_iterator i(*this, 0);
1771
1772                 s32 pos = 0;
1773                 while (!i.atEnd())
1774                 {
1775                         uchar32_t t = *i;
1776                         if (c == t)
1777                                 return pos;
1778                         ++pos;
1779                         ++i;
1780                 }
1781
1782                 return -1;
1783         }
1784
1785         //! Finds first occurrence of a character of a list.
1786         //! \param c A list of characters to find. For example if the method should find the first occurrence of 'a' or 'b', this parameter should be "ab".
1787         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1788         //! \return Position where one of the characters has been found, or -1 if not found.
1789         s32 findFirstChar(const uchar32_t* const c, u32 count=1) const
1790         {
1791                 if (!c || !count)
1792                         return -1;
1793
1794                 const_iterator i(*this, 0);
1795
1796                 s32 pos = 0;
1797                 while (!i.atEnd())
1798                 {
1799                         uchar32_t t = *i;
1800                         for (u32 j=0; j<count; ++j)
1801                                 if (t == c[j])
1802                                         return pos;
1803                         ++pos;
1804                         ++i;
1805                 }
1806
1807                 return -1;
1808         }
1809
1810
1811         //! Finds first position of a character not in a given list.
1812         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1813         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1814         //! \return Position where the character has been found, or -1 if not found.
1815         s32 findFirstCharNotInList(const uchar32_t* const c, u32 count=1) const
1816         {
1817                 if (!c || !count)
1818                         return -1;
1819
1820                 const_iterator i(*this, 0);
1821
1822                 s32 pos = 0;
1823                 while (!i.atEnd())
1824                 {
1825                         uchar32_t t = *i;
1826                         u32 j;
1827                         for (j=0; j<count; ++j)
1828                                 if (t == c[j])
1829                                         break;
1830
1831                         if (j==count)
1832                                 return pos;
1833                         ++pos;
1834                         ++i;
1835                 }
1836
1837                 return -1;
1838         }
1839
1840         //! Finds last position of a character not in a given list.
1841         //! \param c A list of characters to NOT find. For example if the method should find the first occurrence of a character not 'a' or 'b', this parameter should be "ab".
1842         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1843         //! \return Position where the character has been found, or -1 if not found.
1844         s32 findLastCharNotInList(const uchar32_t* const c, u32 count=1) const
1845         {
1846                 if (!c || !count)
1847                         return -1;
1848
1849                 const_iterator i(end());
1850                 --i;
1851
1852                 s32 pos = size() - 1;
1853                 while (!i.atStart())
1854                 {
1855                         uchar32_t t = *i;
1856                         u32 j;
1857                         for (j=0; j<count; ++j)
1858                                 if (t == c[j])
1859                                         break;
1860
1861                         if (j==count)
1862                                 return pos;
1863                         --pos;
1864                         --i;
1865                 }
1866
1867                 return -1;
1868         }
1869
1870         //! Finds next occurrence of character.
1871         //! \param c The character to search for.
1872         //! \param startPos The position in the string to start searching.
1873         //! \return Position where the character has been found, or -1 if not found.
1874         s32 findNext(uchar32_t c, u32 startPos) const
1875         {
1876                 const_iterator i(*this, startPos);
1877
1878                 s32 pos = startPos;
1879                 while (!i.atEnd())
1880                 {
1881                         uchar32_t t = *i;
1882                         if (t == c)
1883                                 return pos;
1884                         ++pos;
1885                         ++i;
1886                 }
1887
1888                 return -1;
1889         }
1890
1891
1892         //! Finds last occurrence of character.
1893         //! \param c The character to search for.
1894         //! \param start The start position of the reverse search ( default = -1, on end ).
1895         //! \return Position where the character has been found, or -1 if not found.
1896         s32 findLast(uchar32_t c, s32 start = -1) const
1897         {
1898                 u32 s = size();
1899                 start = core::clamp ( start < 0 ? (s32)s : start, 0, (s32)s ) - 1;
1900
1901                 const_iterator i(*this, start);
1902                 u32 pos = start;
1903                 while (!i.atStart())
1904                 {
1905                         uchar32_t t = *i;
1906                         if (t == c)
1907                                 return pos;
1908                         --pos;
1909                         --i;
1910                 }
1911
1912                 return -1;
1913         }
1914
1915         //! Finds last occurrence of a character in a list.
1916         //! \param c A list of strings to find. For example if the method should find the last occurrence of 'a' or 'b', this parameter should be "ab".
1917         //! \param count The amount of characters in the list. Usually, this should be strlen(c).
1918         //! \return Position where one of the characters has been found, or -1 if not found.
1919         s32 findLastChar(const uchar32_t* const c, u32 count=1) const
1920         {
1921                 if (!c || !count)
1922                         return -1;
1923
1924                 const_iterator i(end());
1925                 --i;
1926
1927                 s32 pos = size();
1928                 while (!i.atStart())
1929                 {
1930                         uchar32_t t = *i;
1931                         for (u32 j=0; j<count; ++j)
1932                                 if (t == c[j])
1933                                         return pos;
1934                         --pos;
1935                         --i;
1936                 }
1937
1938                 return -1;
1939         }
1940
1941
1942         //! Finds another ustring16 in this ustring16.
1943         //! \param str The string to find.
1944         //! \param start The start position of the search.
1945         //! \return Positions where the ustring16 has been found, or -1 if not found.
1946         s32 find(const ustring16<TAlloc>& str, const u32 start = 0) const
1947         {
1948                 u32 my_size = size();
1949                 u32 their_size = str.size();
1950
1951                 if (their_size == 0 || my_size - start < their_size)
1952                         return -1;
1953
1954                 const_iterator i(*this, start);
1955
1956                 s32 pos = start;
1957                 while (!i.atEnd())
1958                 {
1959                         const_iterator i2(i);
1960                         const_iterator j(str, 0);
1961                         uchar32_t t1 = (uchar32_t)*i2;
1962                         uchar32_t t2 = (uchar32_t)*j;
1963                         while (t1 == t2)
1964                         {
1965                                 ++i2;
1966                                 ++j;
1967                                 if (j.atEnd())
1968                                         return pos;
1969                                 t1 = (uchar32_t)*i2;
1970                                 t2 = (uchar32_t)*j;
1971                         }
1972                         ++i;
1973                         ++pos;
1974                 }
1975
1976                 return -1;
1977         }
1978
1979
1980         //! Finds another ustring16 in this ustring16.
1981         //! \param str The string to find.
1982         //! \param start The start position of the search.
1983         //! \return Positions where the string has been found, or -1 if not found.
1984         s32 find_raw(const ustring16<TAlloc>& str, const u32 start = 0) const
1985         {
1986                 const uchar16_t* data = str.c_str();
1987                 if (data && *data)
1988                 {
1989                         u32 len = 0;
1990
1991                         while (data[len])
1992                                 ++len;
1993
1994                         if (len > used)
1995                                 return -1;
1996
1997                         for (u32 i=start; i<=used-len; ++i)
1998                         {
1999                                 u32 j=0;
2000
2001                                 while(data[j] && array[i+j] == data[j])
2002                                         ++j;
2003
2004                                 if (!data[j])
2005                                         return i;
2006                         }
2007                 }
2008
2009                 return -1;
2010         }
2011
2012
2013         //! Returns a substring.
2014         //! \param begin: Start of substring.
2015         //! \param length: Length of substring.
2016         //! \return A reference to our current string.
2017         ustring16<TAlloc> subString(u32 begin, s32 length) const
2018         {
2019                 u32 len = size();
2020                 // if start after ustring16
2021                 // or no proper substring length
2022                 if ((length <= 0) || (begin>=len))
2023                         return ustring16<TAlloc>("");
2024                 // clamp length to maximal value
2025                 if ((length+begin) > len)
2026                         length = len-begin;
2027
2028                 ustring16<TAlloc> o;
2029                 o.reserve((length+1) * 2);
2030
2031                 const_iterator i(*this, begin);
2032                 while (!i.atEnd() && length)
2033                 {
2034                         o.append(*i);
2035                         ++i;
2036                         --length;
2037                 }
2038
2039                 return o;
2040         }
2041
2042
2043         //! Appends a character to this ustring16.
2044         //! \param c Character to append.
2045         //! \return A reference to our current string.
2046         ustring16<TAlloc>& operator += (char c)
2047         {
2048                 append((uchar32_t)c);
2049                 return *this;
2050         }
2051
2052
2053         //! Appends a character to this ustring16.
2054         //! \param c Character to append.
2055         //! \return A reference to our current string.
2056         ustring16<TAlloc>& operator += (uchar32_t c)
2057         {
2058                 append(c);
2059                 return *this;
2060         }
2061
2062
2063         //! Appends a number to this ustring16.
2064         //! \param c Number to append.
2065         //! \return A reference to our current string.
2066         ustring16<TAlloc>& operator += (short c)
2067         {
2068                 append(core::stringc(c));
2069                 return *this;
2070         }
2071
2072
2073         //! Appends a number to this ustring16.
2074         //! \param c Number to append.
2075         //! \return A reference to our current string.
2076         ustring16<TAlloc>& operator += (unsigned short c)
2077         {
2078                 append(core::stringc(c));
2079                 return *this;
2080         }
2081
2082
2083 #ifdef USTRING_CPP0X_NEWLITERALS
2084         //! Appends a number to this ustring16.
2085         //! \param c Number to append.
2086         //! \return A reference to our current string.
2087         ustring16<TAlloc>& operator += (int c)
2088         {
2089                 append(core::stringc(c));
2090                 return *this;
2091         }
2092
2093
2094         //! Appends a number to this ustring16.
2095         //! \param c Number to append.
2096         //! \return A reference to our current string.
2097         ustring16<TAlloc>& operator += (unsigned int c)
2098         {
2099                 append(core::stringc(c));
2100                 return *this;
2101         }
2102 #endif
2103
2104
2105         //! Appends a number to this ustring16.
2106         //! \param c Number to append.
2107         //! \return A reference to our current string.
2108         ustring16<TAlloc>& operator += (long c)
2109         {
2110                 append(core::stringc(c));
2111                 return *this;
2112         }
2113
2114
2115         //! Appends a number to this ustring16.
2116         //! \param c Number to append.
2117         //! \return A reference to our current string.
2118         ustring16<TAlloc>& operator += (unsigned long c)
2119         {
2120                 append(core::stringc(c));
2121                 return *this;
2122         }
2123
2124
2125         //! Appends a number to this ustring16.
2126         //! \param c Number to append.
2127         //! \return A reference to our current string.
2128         ustring16<TAlloc>& operator += (double c)
2129         {
2130                 append(core::stringc(c));
2131                 return *this;
2132         }
2133
2134
2135         //! Appends a char ustring16 to this ustring16.
2136         //! \param c Char ustring16 to append.
2137         //! \return A reference to our current string.
2138         ustring16<TAlloc>& operator += (const uchar16_t* const c)
2139         {
2140                 append(c);
2141                 return *this;
2142         }
2143
2144
2145         //! Appends a ustring16 to this ustring16.
2146         //! \param other ustring16 to append.
2147         //! \return A reference to our current string.
2148         ustring16<TAlloc>& operator += (const ustring16<TAlloc>& other)
2149         {
2150                 append(other);
2151                 return *this;
2152         }
2153
2154
2155         //! Replaces all characters of a given type with another one.
2156         //! \param toReplace Character to replace.
2157         //! \param replaceWith Character replacing the old one.
2158         //! \return A reference to our current string.
2159         ustring16<TAlloc>& replace(uchar32_t toReplace, uchar32_t replaceWith)
2160         {
2161                 iterator i(*this, 0);
2162                 while (!i.atEnd())
2163                 {
2164                         typename ustring16<TAlloc>::access a = *i;
2165                         if ((uchar32_t)a == toReplace)
2166                                 a = replaceWith;
2167                         ++i;
2168                 }
2169                 return *this;
2170         }
2171
2172
2173         //! Replaces all instances of a string with another one.
2174         //! \param toReplace The string to replace.
2175         //! \param replaceWith The string replacing the old one.
2176         //! \return A reference to our current string.
2177         ustring16<TAlloc>& replace(const ustring16<TAlloc>& toReplace, const ustring16<TAlloc>& replaceWith)
2178         {
2179                 if (toReplace.size() == 0)
2180                         return *this;
2181
2182                 const uchar16_t* other = toReplace.c_str();
2183                 const uchar16_t* replace = replaceWith.c_str();
2184                 const u32 other_size = toReplace.size_raw();
2185                 const u32 replace_size = replaceWith.size_raw();
2186
2187                 // Determine the delta.  The algorithm will change depending on the delta.
2188                 s32 delta = replace_size - other_size;
2189
2190                 // A character for character replace.  The string will not shrink or grow.
2191                 if (delta == 0)
2192                 {
2193                         s32 pos = 0;
2194                         while ((pos = find_raw(other, pos)) != -1)
2195                         {
2196                                 for (u32 i = 0; i < replace_size; ++i)
2197                                         array[pos + i] = replace[i];
2198                                 ++pos;
2199                         }
2200                         return *this;
2201                 }
2202
2203                 // We are going to be removing some characters.  The string will shrink.
2204                 if (delta < 0)
2205                 {
2206                         u32 i = 0;
2207                         for (u32 pos = 0; pos <= used; ++i, ++pos)
2208                         {
2209                                 // Is this potentially a match?
2210                                 if (array[pos] == *other)
2211                                 {
2212                                         // Check to see if we have a match.
2213                                         u32 j;
2214                                         for (j = 0; j < other_size; ++j)
2215                                         {
2216                                                 if (array[pos + j] != other[j])
2217                                                         break;
2218                                         }
2219
2220                                         // If we have a match, replace characters.
2221                                         if (j == other_size)
2222                                         {
2223                                                 for (j = 0; j < replace_size; ++j)
2224                                                         array[i + j] = replace[j];
2225                                                 i += replace_size - 1;
2226                                                 pos += other_size - 1;
2227                                                 continue;
2228                                         }
2229                                 }
2230
2231                                 // No match found, just copy characters.
2232                                 array[i - 1] = array[pos];
2233                         }
2234                         array[i] = 0;
2235                         used = i;
2236
2237                         return *this;
2238                 }
2239
2240                 // We are going to be adding characters, so the string size will increase.
2241                 // Count the number of times toReplace exists in the string so we can allocate the new size.
2242                 u32 find_count = 0;
2243                 s32 pos = 0;
2244                 while ((pos = find_raw(other, pos)) != -1)
2245                 {
2246                         ++find_count;
2247                         ++pos;
2248                 }
2249
2250                 // Re-allocate the string now, if needed.
2251                 u32 len = delta * find_count;
2252                 if (used + len >= allocated)
2253                         reallocate(used + len);
2254
2255                 // Start replacing.
2256                 pos = 0;
2257                 while ((pos = find_raw(other, pos)) != -1)
2258                 {
2259                         uchar16_t* start = array + pos + other_size - 1;
2260                         uchar16_t* ptr   = array + used;
2261                         uchar16_t* end   = array + used + delta;
2262
2263                         // Shift characters to make room for the string.
2264                         while (ptr != start)
2265                         {
2266                                 *end = *ptr;
2267                                 --ptr;
2268                                 --end;
2269                         }
2270
2271                         // Add the new string now.
2272                         for (u32 i = 0; i < replace_size; ++i)
2273                                 array[pos + i] = replace[i];
2274
2275                         pos += replace_size;
2276                         used += delta;
2277                 }
2278
2279                 // Terminate the string and return ourself.
2280                 array[used] = 0;
2281                 return *this;
2282         }
2283
2284
2285         //! Removes characters from a ustring16..
2286         //! \param c The character to remove.
2287         //! \return A reference to our current string.
2288         ustring16<TAlloc>& remove(uchar32_t c)
2289         {
2290                 u32 pos = 0;
2291                 u32 found = 0;
2292                 u32 len = (c > 0xFFFF ? 2 : 1);         // Remove characters equal to the size of c as a UTF-16 character.
2293                 for (u32 i=0; i<=used; ++i)
2294                 {
2295                         uchar32_t uc32 = 0;
2296                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2297                                 uc32 |= array[i];
2298                         else if (i + 1 <= used)
2299                         {
2300                                 // Convert the surrogate pair into a single UTF-32 character.
2301                                 uc32 = unicode::toUTF32(array[i], array[i + 1]);
2302                         }
2303                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2304
2305                         if (uc32 == c)
2306                         {
2307                                 found += len;
2308                                 continue;
2309                         }
2310
2311                         array[pos++] = array[i];
2312                         if (len2 == 2)
2313                                 array[pos++] = array[++i];
2314                 }
2315                 used -= found;
2316                 array[used] = 0;
2317                 return *this;
2318         }
2319
2320
2321         //! Removes a ustring16 from the ustring16.
2322         //! \param toRemove The string to remove.
2323         //! \return A reference to our current string.
2324         ustring16<TAlloc>& remove(const ustring16<TAlloc>& toRemove)
2325         {
2326                 u32 size = toRemove.size_raw();
2327                 if (size == 0) return *this;
2328
2329                 const uchar16_t* tra = toRemove.c_str();
2330                 u32 pos = 0;
2331                 u32 found = 0;
2332                 for (u32 i=0; i<=used; ++i)
2333                 {
2334                         u32 j = 0;
2335                         while (j < size)
2336                         {
2337                                 if (array[i + j] != tra[j])
2338                                         break;
2339                                 ++j;
2340                         }
2341                         if (j == size)
2342                         {
2343                                 found += size;
2344                                 i += size - 1;
2345                                 continue;
2346                         }
2347
2348                         array[pos++] = array[i];
2349                 }
2350                 used -= found;
2351                 array[used] = 0;
2352                 return *this;
2353         }
2354
2355
2356         //! Removes characters from the ustring16.
2357         //! \param characters The characters to remove.
2358         //! \return A reference to our current string.
2359         ustring16<TAlloc>& removeChars(const ustring16<TAlloc>& characters)
2360         {
2361                 if (characters.size_raw() == 0)
2362                         return *this;
2363
2364                 u32 pos = 0;
2365                 u32 found = 0;
2366                 const_iterator iter(characters);
2367                 for (u32 i=0; i<=used; ++i)
2368                 {
2369                         uchar32_t uc32 = 0;
2370                         if (!UTF16_IS_SURROGATE_HI(array[i]))
2371                                 uc32 |= array[i];
2372                         else if (i + 1 <= used)
2373                         {
2374                                 // Convert the surrogate pair into a single UTF-32 character.
2375                                 uc32 = unicode::toUTF32(array[i], array[i+1]);
2376                         }
2377                         u32 len2 = (uc32 > 0xFFFF ? 2 : 1);
2378
2379                         bool cont = false;
2380                         iter.toStart();
2381                         while (!iter.atEnd())
2382                         {
2383                                 uchar32_t c = *iter;
2384                                 if (uc32 == c)
2385                                 {
2386                                         found += (c > 0xFFFF ? 2 : 1);          // Remove characters equal to the size of c as a UTF-16 character.
2387                                         ++i;
2388                                         cont = true;
2389                                         break;
2390                                 }
2391                                 ++iter;
2392                         }
2393                         if (cont) continue;
2394
2395                         array[pos++] = array[i];
2396                         if (len2 == 2)
2397                                 array[pos++] = array[++i];
2398                 }
2399                 used -= found;
2400                 array[used] = 0;
2401                 return *this;
2402         }
2403
2404
2405         //! Trims the ustring16.
2406         //! Removes the specified characters (by default, Latin-1 whitespace) from the begining and the end of the ustring16.
2407         //! \param whitespace The characters that are to be considered as whitespace.
2408         //! \return A reference to our current string.
2409         ustring16<TAlloc>& trim(const ustring16<TAlloc>& whitespace = " \t\n\r")
2410         {
2411                 core::array<uchar32_t> utf32white = whitespace.toUTF32();
2412
2413                 // find start and end of the substring without the specified characters
2414                 const s32 begin = findFirstCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2415                 if (begin == -1)
2416                         return (*this="");
2417
2418                 const s32 end = findLastCharNotInList(utf32white.const_pointer(), whitespace.used + 1);
2419
2420                 return (*this = subString(begin, (end +1) - begin));
2421         }
2422
2423
2424         //! Erases a character from the ustring16.
2425         //! May be slow, because all elements following after the erased element have to be copied.
2426         //! \param index Index of element to be erased.
2427         //! \return A reference to our current string.
2428         ustring16<TAlloc>& erase(u32 index)
2429         {
2430                 _IRR_DEBUG_BREAK_IF(index>used) // access violation
2431
2432                 iterator i(*this, index);
2433
2434                 uchar32_t t = *i;
2435                 u32 len = (t > 0xFFFF ? 2 : 1);
2436
2437                 for (u32 j = static_cast<u32>(i.getPos()) + len; j <= used; ++j)
2438                         array[j - len] = array[j];
2439
2440                 used -= len;
2441                 array[used] = 0;
2442
2443                 return *this;
2444         }
2445
2446
2447         //! Validate the existing ustring16, checking for valid surrogate pairs and checking for proper termination.
2448         //! \return A reference to our current string.
2449         ustring16<TAlloc>& validate()
2450         {
2451                 // Validate all unicode characters.
2452                 for (u32 i=0; i<allocated; ++i)
2453                 {
2454                         // Terminate on existing null.
2455                         if (array[i] == 0)
2456                         {
2457                                 used = i;
2458                                 return *this;
2459                         }
2460                         if (UTF16_IS_SURROGATE(array[i]))
2461                         {
2462                                 if (((i+1) >= allocated) || UTF16_IS_SURROGATE_LO(array[i]))
2463                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2464                                 else if (UTF16_IS_SURROGATE_HI(array[i]) && !UTF16_IS_SURROGATE_LO(array[i+1]))
2465                                         array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2466                                 ++i;
2467                         }
2468                         if (array[i] >= 0xFDD0 && array[i] <= 0xFDEF)
2469                                 array[i] = unicode::UTF_REPLACEMENT_CHARACTER;
2470                 }
2471
2472                 // terminate
2473                 used = 0;
2474                 if (allocated > 0)
2475                 {
2476                         used = allocated - 1;
2477                         array[used] = 0;
2478                 }
2479                 return *this;
2480         }
2481
2482
2483         //! Gets the last char of the ustring16, or 0.
2484         //! \return The last char of the ustring16, or 0.
2485         uchar32_t lastChar() const
2486         {
2487                 if (used < 1)
2488                         return 0;
2489
2490                 if (UTF16_IS_SURROGATE_LO(array[used-1]))
2491                 {
2492                         // Make sure we have a paired surrogate.
2493                         if (used < 2)
2494                                 return 0;
2495
2496                         // Check for an invalid surrogate.
2497                         if (!UTF16_IS_SURROGATE_HI(array[used-2]))
2498                                 return 0;
2499
2500                         // Convert the surrogate pair into a single UTF-32 character.
2501                         return unicode::toUTF32(array[used-2], array[used-1]);
2502                 }
2503                 else
2504                 {
2505                         return array[used-1];
2506                 }
2507         }
2508
2509
2510         //! Split the ustring16 into parts.
2511         /** This method will split a ustring16 at certain delimiter characters
2512         into the container passed in as reference. The type of the container
2513         has to be given as template parameter. It must provide a push_back and
2514         a size method.
2515         \param ret The result container
2516         \param c C-style ustring16 of delimiter characters
2517         \param count Number of delimiter characters
2518         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2519         container. If two delimiters occur without a character in between, an
2520         empty substring would be placed in the result. If this flag is set,
2521         only non-empty strings are stored.
2522         \param keepSeparators Flag which allows to add the separator to the
2523         result ustring16. If this flag is true, the concatenation of the
2524         substrings results in the original ustring16. Otherwise, only the
2525         characters between the delimiters are returned.
2526         \return The number of resulting substrings
2527         */
2528         template<class container>
2529         u32 split(container& ret, const uchar32_t* const c, u32 count=1, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2530         {
2531                 if (!c)
2532                         return 0;
2533
2534                 const_iterator i(*this);
2535                 const u32 oldSize=ret.size();
2536                 u32 pos = 0;
2537                 u32 lastpos = 0;
2538                 u32 lastpospos = 0;
2539                 bool lastWasSeparator = false;
2540                 while (!i.atEnd())
2541                 {
2542                         uchar32_t ch = *i;
2543                         bool foundSeparator = false;
2544                         for (u32 j=0; j<count; ++j)
2545                         {
2546                                 if (ch == c[j])
2547                                 {
2548                                         if ((!ignoreEmptyTokens || pos - lastpos != 0) &&
2549                                                         !lastWasSeparator)
2550                                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], pos - lastpos));
2551                                         foundSeparator = true;
2552                                         lastpos = (keepSeparators ? pos : pos + 1);
2553                                         lastpospos = (keepSeparators ? i.getPos() : i.getPos() + 1);
2554                                         break;
2555                                 }
2556                         }
2557                         lastWasSeparator = foundSeparator;
2558                         ++pos;
2559                         ++i;
2560                 }
2561                 u32 s = size() + 1;
2562                 if (s > lastpos)
2563                         ret.push_back(ustring16<TAlloc>(&array[lastpospos], s - lastpos));
2564                 return ret.size()-oldSize;
2565         }
2566
2567
2568         //! Split the ustring16 into parts.
2569         /** This method will split a ustring16 at certain delimiter characters
2570         into the container passed in as reference. The type of the container
2571         has to be given as template parameter. It must provide a push_back and
2572         a size method.
2573         \param ret The result container
2574         \param c A unicode string of delimiter characters
2575         \param ignoreEmptyTokens Flag to avoid empty substrings in the result
2576         container. If two delimiters occur without a character in between, an
2577         empty substring would be placed in the result. If this flag is set,
2578         only non-empty strings are stored.
2579         \param keepSeparators Flag which allows to add the separator to the
2580         result ustring16. If this flag is true, the concatenation of the
2581         substrings results in the original ustring16. Otherwise, only the
2582         characters between the delimiters are returned.
2583         \return The number of resulting substrings
2584         */
2585         template<class container>
2586         u32 split(container& ret, const ustring16<TAlloc>& c, bool ignoreEmptyTokens=true, bool keepSeparators=false) const
2587         {
2588                 core::array<uchar32_t> v = c.toUTF32();
2589                 return split(ret, v.pointer(), v.size(), ignoreEmptyTokens, keepSeparators);
2590         }
2591
2592
2593         //! Gets the size of the allocated memory buffer for the string.
2594         //! \return The size of the allocated memory buffer.
2595         u32 capacity() const
2596         {
2597                 return allocated;
2598         }
2599
2600
2601         //! Returns the raw number of UTF-16 code points in the string which includes the individual surrogates.
2602         //! \return The raw number of UTF-16 code points, excluding the trialing NUL.
2603         u32 size_raw() const
2604         {
2605                 return used;
2606         }
2607
2608
2609         //! Inserts a character into the string.
2610         //! \param c The character to insert.
2611         //! \param pos The position to insert the character.
2612         //! \return A reference to our current string.
2613         ustring16<TAlloc>& insert(uchar32_t c, u32 pos)
2614         {
2615                 u8 len = (c > 0xFFFF ? 2 : 1);
2616
2617                 if (used + len >= allocated)
2618                         reallocate(used + len);
2619
2620                 used += len;
2621
2622                 iterator iter(*this, pos);
2623                 for (u32 i = used - 2; i > iter.getPos(); --i)
2624                         array[i] = array[i - len];
2625
2626                 if (c > 0xFFFF)
2627                 {
2628                         // c will be multibyte, so split it up into a surrogate pair.
2629                         uchar16_t x = static_cast<uchar16_t>(c);
2630                         uchar16_t vh = UTF16_HI_SURROGATE | ((((c >> 16) & ((1 << 5) - 1)) - 1) << 6) | (x >> 10);
2631                         uchar16_t vl = UTF16_LO_SURROGATE | (x & ((1 << 10) - 1));
2632                         array[iter.getPos()] = vh;
2633                         array[iter.getPos()+1] = vl;
2634                 }
2635                 else
2636                 {
2637                         array[iter.getPos()] = static_cast<uchar16_t>(c);
2638                 }
2639                 array[used] = 0;
2640                 return *this;
2641         }
2642
2643
2644         //! Inserts a string into the string.
2645         //! \param c The string to insert.
2646         //! \param pos The position to insert the string.
2647         //! \return A reference to our current string.
2648         ustring16<TAlloc>& insert(const ustring16<TAlloc>& c, u32 pos)
2649         {
2650                 u32 len = c.size_raw();
2651                 if (len == 0) return *this;
2652
2653                 if (used + len >= allocated)
2654                         reallocate(used + len);
2655
2656                 used += len;
2657
2658                 iterator iter(*this, pos);
2659                 for (u32 i = used - 2; i > iter.getPos() + len; --i)
2660                         array[i] = array[i - len];
2661
2662                 const uchar16_t* s = c.c_str();
2663                 for (u32 i = 0; i < len; ++i)
2664                 {
2665                         array[pos++] = *s;
2666                         ++s;
2667                 }
2668
2669                 array[used] = 0;
2670                 return *this;
2671         }
2672
2673
2674         //! Inserts a character into the string.
2675         //! \param c The character to insert.
2676         //! \param pos The position to insert the character.
2677         //! \return A reference to our current string.
2678         ustring16<TAlloc>& insert_raw(uchar16_t c, u32 pos)
2679         {
2680                 if (used + 1 >= allocated)
2681                         reallocate(used + 1);
2682
2683                 ++used;
2684
2685                 for (u32 i = used - 1; i > pos; --i)
2686                         array[i] = array[i - 1];
2687
2688                 array[pos] = c;
2689                 array[used] = 0;
2690                 return *this;
2691         }
2692
2693
2694         //! Removes a character from string.
2695         //! \param pos Position of the character to remove.
2696         //! \return A reference to our current string.
2697         ustring16<TAlloc>& erase_raw(u32 pos)
2698         {
2699                 for (u32 i=pos; i<=used; ++i)
2700                 {
2701                         array[i] = array[i + 1];
2702                 }
2703                 --used;
2704                 array[used] = 0;
2705                 return *this;
2706         }
2707
2708
2709         //! Replaces a character in the string.
2710         //! \param c The new character.
2711         //! \param pos The position of the character to replace.
2712         //! \return A reference to our current string.
2713         ustring16<TAlloc>& replace_raw(uchar16_t c, u32 pos)
2714         {
2715                 array[pos] = c;
2716                 return *this;
2717         }
2718
2719
2720         //! Returns an iterator to the beginning of the string.
2721         //! \return An iterator to the beginning of the string.
2722         iterator begin()
2723         {
2724                 iterator i(*this, 0);
2725                 return i;
2726         }
2727
2728
2729         //! Returns an iterator to the beginning of the string.
2730         //! \return An iterator to the beginning of the string.
2731         const_iterator begin() const
2732         {
2733                 const_iterator i(*this, 0);
2734                 return i;
2735         }
2736
2737
2738         //! Returns an iterator to the beginning of the string.
2739         //! \return An iterator to the beginning of the string.
2740         const_iterator cbegin() const
2741         {
2742                 const_iterator i(*this, 0);
2743                 return i;
2744         }
2745
2746
2747         //! Returns an iterator to the end of the string.
2748         //! \return An iterator to the end of the string.
2749         iterator end()
2750         {
2751                 iterator i(*this, 0);
2752                 i.toEnd();
2753                 return i;
2754         }
2755
2756
2757         //! Returns an iterator to the end of the string.
2758         //! \return An iterator to the end of the string.
2759         const_iterator end() const
2760         {
2761                 const_iterator i(*this, 0);
2762                 i.toEnd();
2763                 return i;
2764         }
2765
2766
2767         //! Returns an iterator to the end of the string.
2768         //! \return An iterator to the end of the string.
2769         const_iterator cend() const
2770         {
2771                 const_iterator i(*this, 0);
2772                 i.toEnd();
2773                 return i;
2774         }
2775
2776
2777         //! Converts the string to a UTF-8 encoded string.
2778         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2779         //! \return A string containing the UTF-8 encoded string.
2780         core::string<uchar8_t> toUTF8_s(const bool addBOM = false) const
2781         {
2782                 core::string<uchar8_t> ret;
2783                 ret.reserve(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2784                 const_iterator iter(*this, 0);
2785
2786                 // Add the byte order mark if the user wants it.
2787                 if (addBOM)
2788                 {
2789                         ret.append(unicode::BOM_ENCODE_UTF8[0]);
2790                         ret.append(unicode::BOM_ENCODE_UTF8[1]);
2791                         ret.append(unicode::BOM_ENCODE_UTF8[2]);
2792                 }
2793
2794                 while (!iter.atEnd())
2795                 {
2796                         uchar32_t c = *iter;
2797                         if (c > 0xFFFF)
2798                         {       // 4 bytes
2799                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2800                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2801                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2802                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2803                                 ret.append(b1);
2804                                 ret.append(b2);
2805                                 ret.append(b3);
2806                                 ret.append(b4);
2807                         }
2808                         else if (c > 0x7FF)
2809                         {       // 3 bytes
2810                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2811                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2812                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2813                                 ret.append(b1);
2814                                 ret.append(b2);
2815                                 ret.append(b3);
2816                         }
2817                         else if (c > 0x7F)
2818                         {       // 2 bytes
2819                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2820                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2821                                 ret.append(b1);
2822                                 ret.append(b2);
2823                         }
2824                         else
2825                         {       // 1 byte
2826                                 ret.append(static_cast<uchar8_t>(c));
2827                         }
2828                         ++iter;
2829                 }
2830                 return ret;
2831         }
2832
2833
2834         //! Converts the string to a UTF-8 encoded string array.
2835         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2836         //! \return An array containing the UTF-8 encoded string.
2837         core::array<uchar8_t> toUTF8(const bool addBOM = false) const
2838         {
2839                 core::array<uchar8_t> ret(used * 4 + (addBOM ? unicode::BOM_UTF8_LEN : 0) + 1);
2840                 const_iterator iter(*this, 0);
2841
2842                 // Add the byte order mark if the user wants it.
2843                 if (addBOM)
2844                 {
2845                         ret.push_back(unicode::BOM_ENCODE_UTF8[0]);
2846                         ret.push_back(unicode::BOM_ENCODE_UTF8[1]);
2847                         ret.push_back(unicode::BOM_ENCODE_UTF8[2]);
2848                 }
2849
2850                 while (!iter.atEnd())
2851                 {
2852                         uchar32_t c = *iter;
2853                         if (c > 0xFFFF)
2854                         {       // 4 bytes
2855                                 uchar8_t b1 = (0x1E << 3) | ((c >> 18) & 0x7);
2856                                 uchar8_t b2 = (0x2 << 6) | ((c >> 12) & 0x3F);
2857                                 uchar8_t b3 = (0x2 << 6) | ((c >> 6) & 0x3F);
2858                                 uchar8_t b4 = (0x2 << 6) | (c & 0x3F);
2859                                 ret.push_back(b1);
2860                                 ret.push_back(b2);
2861                                 ret.push_back(b3);
2862                                 ret.push_back(b4);
2863                         }
2864                         else if (c > 0x7FF)
2865                         {       // 3 bytes
2866                                 uchar8_t b1 = (0xE << 4) | ((c >> 12) & 0xF);
2867                                 uchar8_t b2 = (0x2 << 6) | ((c >> 6) & 0x3F);
2868                                 uchar8_t b3 = (0x2 << 6) | (c & 0x3F);
2869                                 ret.push_back(b1);
2870                                 ret.push_back(b2);
2871                                 ret.push_back(b3);
2872                         }
2873                         else if (c > 0x7F)
2874                         {       // 2 bytes
2875                                 uchar8_t b1 = (0x6 << 5) | ((c >> 6) & 0x1F);
2876                                 uchar8_t b2 = (0x2 << 6) | (c & 0x3F);
2877                                 ret.push_back(b1);
2878                                 ret.push_back(b2);
2879                         }
2880                         else
2881                         {       // 1 byte
2882                                 ret.push_back(static_cast<uchar8_t>(c));
2883                         }
2884                         ++iter;
2885                 }
2886                 ret.push_back(0);
2887                 return ret;
2888         }
2889
2890
#ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
//! Converts the string to a UTF-16 encoded string.
//! \param endian The desired endianness of the string.
//! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
//! \return A string containing the UTF-16 encoded string.
core::string<char16_t> toUTF16_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
{
        core::string<char16_t> ret;
        ret.reserve(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);

        // Add the BOM if specified. It is appended (not poked into reserved but
        // unused storage) so that it is part of the string and survives the
        // appends that follow.
        if (addBOM)
        {
                char16_t bom = unicode::BOM;
                // For an explicit byte order, take the two BOM bytes exactly as encoded.
                if (endian == unicode::EUTFEE_LITTLE)
                        memcpy(&bom, unicode::BOM_ENCODE_UTF16_LE, sizeof(bom));
                else if (endian != unicode::EUTFEE_NATIVE)
                        memcpy(&bom, unicode::BOM_ENCODE_UTF16_BE, sizeof(bom));
                ret.append(bom);
        }

        // Append the code units, byte-swapped when the requested order differs
        // from ours. The BOM above is already in its final byte order and is
        // therefore not swapped.
        const bool swapBytes = (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian);
        for (u32 i = 0; i < used; ++i)
                ret.append(static_cast<char16_t>(swapBytes ? unicode::swapEndian16(array[i]) : array[i]));

        return ret;
}
#endif
2930
2931
2932         //! Converts the string to a UTF-16 encoded string array.
2933         //! Unfortunately, no toUTF16_s() version exists due to limitations with Irrlicht's string class.
2934         //! \param endian The desired endianness of the string.
2935         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2936         //! \return An array containing the UTF-16 encoded string.
2937         core::array<uchar16_t> toUTF16(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2938         {
2939                 core::array<uchar16_t> ret(used + (addBOM ? unicode::BOM_UTF16_LEN : 0) + 1);
2940                 uchar16_t* ptr = ret.pointer();
2941
2942                 // Add the BOM if specified.
2943                 if (addBOM)
2944                 {
2945                         if (endian == unicode::EUTFEE_NATIVE)
2946                                 *ptr = unicode::BOM;
2947                         else if (endian == unicode::EUTFEE_LITTLE)
2948                         {
2949                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2950                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_LE[0];
2951                                 *ptr8 = unicode::BOM_ENCODE_UTF16_LE[1];
2952                         }
2953                         else
2954                         {
2955                                 uchar8_t* ptr8 = reinterpret_cast<uchar8_t*>(ptr);
2956                                 *ptr8++ = unicode::BOM_ENCODE_UTF16_BE[0];
2957                                 *ptr8 = unicode::BOM_ENCODE_UTF16_BE[1];
2958                         }
2959                         ++ptr;
2960                 }
2961
2962                 memcpy((void*)ptr, (void*)array, used * sizeof(uchar16_t));
2963                 if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
2964                 {
2965                         for (u32 i = 0; i <= used; ++i)
2966                                 ptr[i] = unicode::swapEndian16(ptr[i]);
2967                 }
2968                 ret.set_used(used + (addBOM ? unicode::BOM_UTF16_LEN : 0));
2969                 ret.push_back(0);
2970                 return ret;
2971         }
2972
2973
2974 #ifdef USTRING_CPP0X_NEWLITERALS        // C++0x
2975         //! Converts the string to a UTF-32 encoded string.
2976         //! \param endian The desired endianness of the string.
2977         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
2978         //! \return A string containing the UTF-32 encoded string.
2979         core::string<char32_t> toUTF32_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
2980         {
2981                 core::string<char32_t> ret;
2982                 ret.reserve(size() + 1 + (addBOM ? unicode::BOM_UTF32_LEN : 0));
2983                 const_iterator iter(*this, 0);
2984
2985                 // Add the BOM if specified.
2986                 if (addBOM)
2987                 {
2988                         if (endian == unicode::EUTFEE_NATIVE)
2989                                 ret.append(unicode::BOM);
2990                         else
2991                         {
2992                                 union
2993                                 {
2994                                         uchar32_t full;
2995                                         u8 chunk[4];
2996                                 } t;
2997
2998                                 if (endian == unicode::EUTFEE_LITTLE)
2999                                 {
3000                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3001                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3002                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3003                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3004                                 }
3005                                 else
3006                                 {
3007                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3008                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3009                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3010                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3011                                 }
3012                                 ret.append(t.full);
3013                         }
3014                 }
3015
3016                 while (!iter.atEnd())
3017                 {
3018                         uchar32_t c = *iter;
3019                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3020                                 c = unicode::swapEndian32(c);
3021                         ret.append(c);
3022                         ++iter;
3023                 }
3024                 return ret;
3025         }
3026 #endif
3027
3028
3029         //! Converts the string to a UTF-32 encoded string array.
3030         //! Unfortunately, no toUTF32_s() version exists due to limitations with Irrlicht's string class.
3031         //! \param endian The desired endianness of the string.
3032         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3033         //! \return An array containing the UTF-32 encoded string.
3034         core::array<uchar32_t> toUTF32(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3035         {
3036                 core::array<uchar32_t> ret(size() + (addBOM ? unicode::BOM_UTF32_LEN : 0) + 1);
3037                 const_iterator iter(*this, 0);
3038
3039                 // Add the BOM if specified.
3040                 if (addBOM)
3041                 {
3042                         if (endian == unicode::EUTFEE_NATIVE)
3043                                 ret.push_back(unicode::BOM);
3044                         else
3045                         {
3046                                 union
3047                                 {
3048                                         uchar32_t full;
3049                                         u8 chunk[4];
3050                                 } t;
3051
3052                                 if (endian == unicode::EUTFEE_LITTLE)
3053                                 {
3054                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_LE[0];
3055                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_LE[1];
3056                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_LE[2];
3057                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_LE[3];
3058                                 }
3059                                 else
3060                                 {
3061                                         t.chunk[0] = unicode::BOM_ENCODE_UTF32_BE[0];
3062                                         t.chunk[1] = unicode::BOM_ENCODE_UTF32_BE[1];
3063                                         t.chunk[2] = unicode::BOM_ENCODE_UTF32_BE[2];
3064                                         t.chunk[3] = unicode::BOM_ENCODE_UTF32_BE[3];
3065                                 }
3066                                 ret.push_back(t.full);
3067                         }
3068                 }
3069                 ret.push_back(0);
3070
3071                 while (!iter.atEnd())
3072                 {
3073                         uchar32_t c = *iter;
3074                         if (endian != unicode::EUTFEE_NATIVE && getEndianness() != endian)
3075                                 c = unicode::swapEndian32(c);
3076                         ret.push_back(c);
3077                         ++iter;
3078                 }
3079                 return ret;
3080         }
3081
3082
3083         //! Converts the string to a wchar_t encoded string.
3084         /** The size of a wchar_t changes depending on the platform.  This function will store a
3085         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3086         //! \param endian The desired endianness of the string.
3087         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3088         //! \return A string containing the wchar_t encoded string.
3089         core::string<wchar_t> toWCHAR_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3090         {
3091                 if (sizeof(wchar_t) == 4)
3092                 {
3093                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3094                         core::stringw ret(a.pointer());
3095                         return ret;
3096                 }
3097                 else if (sizeof(wchar_t) == 2)
3098                 {
3099                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3100                         {
3101                                 core::stringw ret(array);
3102                                 return ret;
3103                         }
3104                         else
3105                         {
3106                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3107                                 core::stringw ret(a.pointer());
3108                                 return ret;
3109                         }
3110                 }
3111                 else if (sizeof(wchar_t) == 1)
3112                 {
3113                         core::array<uchar8_t> a(toUTF8(addBOM));
3114                         core::stringw ret(a.pointer());
3115                         return ret;
3116                 }
3117
3118                 // Shouldn't happen.
3119                 return core::stringw();
3120         }
3121
3122
3123         //! Converts the string to a wchar_t encoded string array.
3124         /** The size of a wchar_t changes depending on the platform.  This function will store a
3125         correct UTF-8, -16, or -32 encoded string depending on the size of a wchar_t. **/
3126         //! \param endian The desired endianness of the string.
3127         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3128         //! \return An array containing the wchar_t encoded string.
3129         core::array<wchar_t> toWCHAR(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3130         {
3131                 if (sizeof(wchar_t) == 4)
3132                 {
3133                         core::array<uchar32_t> a(toUTF32(endian, addBOM));
3134                         core::array<wchar_t> ret(a.size());
3135                         ret.set_used(a.size());
3136                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar32_t));
3137                         return ret;
3138                 }
3139                 if (sizeof(wchar_t) == 2)
3140                 {
3141                         if (endian == unicode::EUTFEE_NATIVE && addBOM == false)
3142                         {
3143                                 core::array<wchar_t> ret(used);
3144                                 ret.set_used(used);
3145                                 memcpy((void*)ret.pointer(), (void*)array, used * sizeof(uchar16_t));
3146                                 return ret;
3147                         }
3148                         else
3149                         {
3150                                 core::array<uchar16_t> a(toUTF16(endian, addBOM));
3151                                 core::array<wchar_t> ret(a.size());
3152                                 ret.set_used(a.size());
3153                                 memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar16_t));
3154                                 return ret;
3155                         }
3156                 }
3157                 if (sizeof(wchar_t) == 1)
3158                 {
3159                         core::array<uchar8_t> a(toUTF8(addBOM));
3160                         core::array<wchar_t> ret(a.size());
3161                         ret.set_used(a.size());
3162                         memcpy((void*)ret.pointer(), (void*)a.pointer(), a.size() * sizeof(uchar8_t));
3163                         return ret;
3164                 }
3165
3166                 // Shouldn't happen.
3167                 return core::array<wchar_t>();
3168         }
3169
3170         //! Converts the string to a properly encoded io::path string.
3171         //! \param endian The desired endianness of the string.
3172         //! \param addBOM If true, the proper unicode byte-order mark will be prefixed to the string.
3173         //! \return An io::path string containing the properly encoded string.
3174         io::path toPATH_s(const unicode::EUTF_ENDIAN endian = unicode::EUTFEE_NATIVE, const bool addBOM = false) const
3175         {
3176 #if defined(_IRR_WCHAR_FILESYSTEM)
3177                 return toWCHAR_s(endian, addBOM);
3178 #else
3179                 return toUTF8_s(addBOM);
3180 #endif
3181         }
3182
3183         //! Loads an unknown stream of data.
3184         //! Will attempt to determine if the stream is unicode data.  Useful for loading from files.
3185         //! \param data The data stream to load from.
3186         //! \param data_size The length of the data string.
3187         //! \return A reference to our current string.
3188         ustring16<TAlloc>& loadDataStream(const char* data, size_t data_size)
3189         {
3190                 // Clear our string.
3191                 *this = "";
3192                 if (!data)
3193                         return *this;
3194
3195                 unicode::EUTF_ENCODE e = unicode::determineUnicodeBOM(data);
3196                 switch (e)
3197                 {
3198                         default:
3199                         case unicode::EUTFE_UTF8:
3200                                 append((uchar8_t*)data, data_size);
3201                                 break;
3202
3203                         case unicode::EUTFE_UTF16:
3204                         case unicode::EUTFE_UTF16_BE:
3205                         case unicode::EUTFE_UTF16_LE:
3206                                 append((uchar16_t*)data, data_size / 2);
3207                                 break;
3208
3209                         case unicode::EUTFE_UTF32:
3210                         case unicode::EUTFE_UTF32_BE:
3211                         case unicode::EUTFE_UTF32_LE:
3212                                 append((uchar32_t*)data, data_size / 4);
3213                                 break;
3214                 }
3215
3216                 return *this;
3217         }
3218
3219         //! Gets the encoding of the Unicode string this class contains.
3220         //! \return An enum describing the current encoding of this string.
3221         const unicode::EUTF_ENCODE getEncoding() const
3222         {
3223                 return encoding;
3224         }
3225
3226         //! Gets the endianness of the Unicode string this class contains.
3227         //! \return An enum describing the endianness of this string.
3228         const unicode::EUTF_ENDIAN getEndianness() const
3229         {
3230                 if (encoding == unicode::EUTFE_UTF16_LE ||
3231                         encoding == unicode::EUTFE_UTF32_LE)
3232                         return unicode::EUTFEE_LITTLE;
3233                 else return unicode::EUTFEE_BIG;
3234         }
3235
3236 private:
3237
3238         //! Reallocate the string, making it bigger or smaller.
3239         //! \param new_size The new size of the string.
3240         void reallocate(u32 new_size)
3241         {
3242                 uchar16_t* old_array = array;
3243
3244                 array = allocator.allocate(new_size + 1); //new u16[new_size];
3245                 allocated = new_size + 1;
3246                 if (old_array == 0) return;
3247
3248                 u32 amount = used < new_size ? used : new_size;
3249                 for (u32 i=0; i<=amount; ++i)
3250                         array[i] = old_array[i];
3251
3252                 if (allocated <= used)
3253                         used = allocated - 1;
3254
3255                 array[used] = 0;
3256
3257                 allocator.deallocate(old_array); // delete [] old_array;
3258         }
3259
        //--- member variables

        //! Buffer of UTF-16 code units obtained from the allocator; reallocate() keeps array[used] set to 0.
        uchar16_t* array;
        //! Encoding tag reported by getEncoding(); getEndianness() derives the byte order from it.
        unicode::EUTF_ENCODE encoding;
        //! Number of elements in the buffer, including the slot for the terminator.
        u32 allocated;
        //! Number of code units currently in use, not counting the terminator.
        u32 used;
        //! Allocator used to obtain and release the buffer.
        TAlloc allocator;
        //irrAllocator<uchar16_t> allocator;
3268 };
3269
//! Convenience name for a ustring16 that allocates through irrAllocator.
typedef ustring16<irrAllocator<uchar16_t> > ustring;
3271
3272
3273 //! Appends two ustring16s.
3274 template <typename TAlloc>
3275 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const ustring16<TAlloc>& right)
3276 {
3277         ustring16<TAlloc> ret(left);
3278         ret += right;
3279         return ret;
3280 }
3281
3282
3283 //! Appends a ustring16 and a null-terminated unicode string.
3284 template <typename TAlloc, class B>
3285 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const B* const right)
3286 {
3287         ustring16<TAlloc> ret(left);
3288         ret += right;
3289         return ret;
3290 }
3291
3292
3293 //! Appends a ustring16 and a null-terminated unicode string.
3294 template <class B, typename TAlloc>
3295 inline ustring16<TAlloc> operator+(const B* const left, const ustring16<TAlloc>& right)
3296 {
3297         ustring16<TAlloc> ret(left);
3298         ret += right;
3299         return ret;
3300 }
3301
3302
3303 //! Appends a ustring16 and an Irrlicht string.
3304 template <typename TAlloc, typename B, typename BAlloc>
3305 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const string<B, BAlloc>& right)
3306 {
3307         ustring16<TAlloc> ret(left);
3308         ret += right;
3309         return ret;
3310 }
3311
3312
3313 //! Appends a ustring16 and an Irrlicht string.
3314 template <typename TAlloc, typename B, typename BAlloc>
3315 inline ustring16<TAlloc> operator+(const string<B, BAlloc>& left, const ustring16<TAlloc>& right)
3316 {
3317         ustring16<TAlloc> ret(left);
3318         ret += right;
3319         return ret;
3320 }
3321
3322
3323 //! Appends a ustring16 and a std::basic_string.
3324 template <typename TAlloc, typename B, typename A, typename BAlloc>
3325 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const std::basic_string<B, A, BAlloc>& right)
3326 {
3327         ustring16<TAlloc> ret(left);
3328         ret += right;
3329         return ret;
3330 }
3331
3332
3333 //! Appends a ustring16 and a std::basic_string.
3334 template <typename TAlloc, typename B, typename A, typename BAlloc>
3335 inline ustring16<TAlloc> operator+(const std::basic_string<B, A, BAlloc>& left, const ustring16<TAlloc>& right)
3336 {
3337         ustring16<TAlloc> ret(left);
3338         ret += right;
3339         return ret;
3340 }
3341
3342
3343 //! Appends a ustring16 and a char.
3344 template <typename TAlloc>
3345 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const char right)
3346 {
3347         ustring16<TAlloc> ret(left);
3348         ret += right;
3349         return ret;
3350 }
3351
3352
3353 //! Appends a ustring16 and a char.
3354 template <typename TAlloc>
3355 inline ustring16<TAlloc> operator+(const char left, const ustring16<TAlloc>& right)
3356 {
3357         ustring16<TAlloc> ret(left);
3358         ret += right;
3359         return ret;
3360 }
3361
3362
3363 #ifdef USTRING_CPP0X_NEWLITERALS
3364 //! Appends a ustring16 and a uchar32_t.
3365 template <typename TAlloc>
3366 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const uchar32_t right)
3367 {
3368         ustring16<TAlloc> ret(left);
3369         ret += right;
3370         return ret;
3371 }
3372
3373
3374 //! Appends a ustring16 and a uchar32_t.
3375 template <typename TAlloc>
3376 inline ustring16<TAlloc> operator+(const uchar32_t left, const ustring16<TAlloc>& right)
3377 {
3378         ustring16<TAlloc> ret(left);
3379         ret += right;
3380         return ret;
3381 }
3382 #endif
3383
3384
3385 //! Appends a ustring16 and a short.
3386 template <typename TAlloc>
3387 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const short right)
3388 {
3389         ustring16<TAlloc> ret(left);
3390         ret += core::stringc(right);
3391         return ret;
3392 }
3393
3394
3395 //! Appends a ustring16 and a short.
3396 template <typename TAlloc>
3397 inline ustring16<TAlloc> operator+(const short left, const ustring16<TAlloc>& right)
3398 {
3399         ustring16<TAlloc> ret((core::stringc(left)));
3400         ret += right;
3401         return ret;
3402 }
3403
3404
3405 //! Appends a ustring16 and an unsigned short.
3406 template <typename TAlloc>
3407 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned short right)
3408 {
3409         ustring16<TAlloc> ret(left);
3410         ret += core::stringc(right);
3411         return ret;
3412 }
3413
3414
3415 //! Appends a ustring16 and an unsigned short.
3416 template <typename TAlloc>
3417 inline ustring16<TAlloc> operator+(const unsigned short left, const ustring16<TAlloc>& right)
3418 {
3419         ustring16<TAlloc> ret((core::stringc(left)));
3420         ret += right;
3421         return ret;
3422 }
3423
3424
3425 //! Appends a ustring16 and an int.
3426 template <typename TAlloc>
3427 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const int right)
3428 {
3429         ustring16<TAlloc> ret(left);
3430         ret += core::stringc(right);
3431         return ret;
3432 }
3433
3434
3435 //! Appends a ustring16 and an int.
3436 template <typename TAlloc>
3437 inline ustring16<TAlloc> operator+(const int left, const ustring16<TAlloc>& right)
3438 {
3439         ustring16<TAlloc> ret((core::stringc(left)));
3440         ret += right;
3441         return ret;
3442 }
3443
3444
3445 //! Appends a ustring16 and an unsigned int.
3446 template <typename TAlloc>
3447 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned int right)
3448 {
3449         ustring16<TAlloc> ret(left);
3450         ret += core::stringc(right);
3451         return ret;
3452 }
3453
3454
3455 //! Appends a ustring16 and an unsigned int.
3456 template <typename TAlloc>
3457 inline ustring16<TAlloc> operator+(const unsigned int left, const ustring16<TAlloc>& right)
3458 {
3459         ustring16<TAlloc> ret((core::stringc(left)));
3460         ret += right;
3461         return ret;
3462 }
3463
3464
3465 //! Appends a ustring16 and a long.
3466 template <typename TAlloc>
3467 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const long right)
3468 {
3469         ustring16<TAlloc> ret(left);
3470         ret += core::stringc(right);
3471         return ret;
3472 }
3473
3474
3475 //! Appends a ustring16 and a long.
3476 template <typename TAlloc>
3477 inline ustring16<TAlloc> operator+(const long left, const ustring16<TAlloc>& right)
3478 {
3479         ustring16<TAlloc> ret((core::stringc(left)));
3480         ret += right;
3481         return ret;
3482 }
3483
3484
3485 //! Appends a ustring16 and an unsigned long.
3486 template <typename TAlloc>
3487 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const unsigned long right)
3488 {
3489         ustring16<TAlloc> ret(left);
3490         ret += core::stringc(right);
3491         return ret;
3492 }
3493
3494
3495 //! Appends a ustring16 and an unsigned long.
3496 template <typename TAlloc>
3497 inline ustring16<TAlloc> operator+(const unsigned long left, const ustring16<TAlloc>& right)
3498 {
3499         ustring16<TAlloc> ret((core::stringc(left)));
3500         ret += right;
3501         return ret;
3502 }
3503
3504
3505 //! Appends a ustring16 and a float.
3506 template <typename TAlloc>
3507 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const float right)
3508 {
3509         ustring16<TAlloc> ret(left);
3510         ret += core::stringc(right);
3511         return ret;
3512 }
3513
3514
3515 //! Appends a ustring16 and a float.
3516 template <typename TAlloc>
3517 inline ustring16<TAlloc> operator+(const float left, const ustring16<TAlloc>& right)
3518 {
3519         ustring16<TAlloc> ret((core::stringc(left)));
3520         ret += right;
3521         return ret;
3522 }
3523
3524
3525 //! Appends a ustring16 and a double.
3526 template <typename TAlloc>
3527 inline ustring16<TAlloc> operator+(const ustring16<TAlloc>& left, const double right)
3528 {
3529         ustring16<TAlloc> ret(left);
3530         ret += core::stringc(right);
3531         return ret;
3532 }
3533
3534
3535 //! Appends a ustring16 and a double.
3536 template <typename TAlloc>
3537 inline ustring16<TAlloc> operator+(const double left, const ustring16<TAlloc>& right)
3538 {
3539         ustring16<TAlloc> ret((core::stringc(left)));
3540         ret += right;
3541         return ret;
3542 }
3543
3544
3545 #ifdef USTRING_CPP0X
3546 //! Appends two ustring16s.
3547 template <typename TAlloc>
3548 inline ustring16<TAlloc>&& operator+(const ustring16<TAlloc>& left, ustring16<TAlloc>&& right)
3549 {
3550         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3551         right.insert(left, 0);
3552         return std::move(right);
3553 }
3554
3555
3556 //! Appends two ustring16s.
3557 template <typename TAlloc>
3558 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const ustring16<TAlloc>& right)
3559 {
3560         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3561         left.append(right);
3562         return std::move(left);
3563 }
3564
3565
3566 //! Appends two ustring16s.
3567 template <typename TAlloc>
3568 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, ustring16<TAlloc>&& right)
3569 {
3570         //std::cout << "MOVE operator+(&&, &&)" << std::endl;
3571         if ((right.size_raw() <= left.capacity() - left.size_raw()) ||
3572                 (right.capacity() - right.size_raw() < left.size_raw()))
3573         {
3574                 left.append(right);
3575                 return std::move(left);
3576         }
3577         else
3578         {
3579                 right.insert(left, 0);
3580                 return std::move(right);
3581         }
3582 }
3583
3584
3585 //! Appends a ustring16 and a null-terminated unicode string.
3586 template <typename TAlloc, class B>
3587 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const B* const right)
3588 {
3589         //std::cout << "MOVE operator+(&&, B*)" << std::endl;
3590         left.append(right);
3591         return std::move(left);
3592 }
3593
3594
3595 //! Appends a ustring16 and a null-terminated unicode string.
3596 template <class B, typename TAlloc>
3597 inline ustring16<TAlloc>&& operator+(const B* const left, ustring16<TAlloc>&& right)
3598 {
3599         //std::cout << "MOVE operator+(B*, &&)" << std::endl;
3600         right.insert(left, 0);
3601         return std::move(right);
3602 }
3603
3604
3605 //! Appends a ustring16 and an Irrlicht string.
3606 template <typename TAlloc, typename B, typename BAlloc>
3607 inline ustring16<TAlloc>&& operator+(const string<B, BAlloc>& left, ustring16<TAlloc>&& right)
3608 {
3609         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3610         right.insert(left, 0);
3611         return std::move(right);
3612 }
3613
3614
3615 //! Appends a ustring16 and an Irrlicht string.
3616 template <typename TAlloc, typename B, typename BAlloc>
3617 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const string<B, BAlloc>& right)
3618 {
3619         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3620         left.append(right);
3621         return std::move(left);
3622 }
3623
3624
3625 //! Appends a ustring16 and a std::basic_string.
3626 template <typename TAlloc, typename B, typename A, typename BAlloc>
3627 inline ustring16<TAlloc>&& operator+(const std::basic_string<B, A, BAlloc>& left, ustring16<TAlloc>&& right)
3628 {
3629         //std::cout << "MOVE operator+(&, &&)" << std::endl;
3630         right.insert(core::ustring16<TAlloc>(left), 0);
3631         return std::move(right);
3632 }
3633
3634
3635 //! Appends a ustring16 and a std::basic_string.
3636 template <typename TAlloc, typename B, typename A, typename BAlloc>
3637 inline ustring16<TAlloc>&& operator+(ustring16<TAlloc>&& left, const std::basic_string<B, A, BAlloc>& right)
3638 {
3639         //std::cout << "MOVE operator+(&&, &)" << std::endl;
3640         left.append(right);
3641         return std::move(left);
3642 }
3643
3644
3645 //! Appends a ustring16 and a char.
3646 template <typename TAlloc>
3647 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const char right)
3648 {
3649         left.append((uchar32_t)right);
3650         return std::move(left);
3651 }
3652
3653
3654 //! Appends a ustring16 and a char.
3655 template <typename TAlloc>
3656 inline ustring16<TAlloc> operator+(const char left, ustring16<TAlloc>&& right)
3657 {
3658         right.insert((uchar32_t)left, 0);
3659         return std::move(right);
3660 }
3661
3662
3663 #ifdef USTRING_CPP0X_NEWLITERALS
3664 //! Appends a ustring16 and a uchar32_t.
3665 template <typename TAlloc>
3666 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const uchar32_t right)
3667 {
3668         left.append(right);
3669         return std::move(left);
3670 }
3671
3672
3673 //! Appends a ustring16 and a uchar32_t.
3674 template <typename TAlloc>
3675 inline ustring16<TAlloc> operator+(const uchar32_t left, ustring16<TAlloc>&& right)
3676 {
3677         right.insert(left, 0);
3678         return std::move(right);
3679 }
3680 #endif
3681
3682
3683 //! Appends a ustring16 and a short.
3684 template <typename TAlloc>
3685 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const short right)
3686 {
3687         left.append(core::stringc(right));
3688         return std::move(left);
3689 }
3690
3691
3692 //! Appends a ustring16 and a short.
3693 template <typename TAlloc>
3694 inline ustring16<TAlloc> operator+(const short left, ustring16<TAlloc>&& right)
3695 {
3696         right.insert(core::stringc(left), 0);
3697         return std::move(right);
3698 }
3699
3700
3701 //! Appends a ustring16 and an unsigned short.
3702 template <typename TAlloc>
3703 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned short right)
3704 {
3705         left.append(core::stringc(right));
3706         return std::move(left);
3707 }
3708
3709
3710 //! Appends a ustring16 and an unsigned short.
3711 template <typename TAlloc>
3712 inline ustring16<TAlloc> operator+(const unsigned short left, ustring16<TAlloc>&& right)
3713 {
3714         right.insert(core::stringc(left), 0);
3715         return std::move(right);
3716 }
3717
3718
3719 //! Appends a ustring16 and an int.
3720 template <typename TAlloc>
3721 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const int right)
3722 {
3723         left.append(core::stringc(right));
3724         return std::move(left);
3725 }
3726
3727
3728 //! Appends a ustring16 and an int.
3729 template <typename TAlloc>
3730 inline ustring16<TAlloc> operator+(const int left, ustring16<TAlloc>&& right)
3731 {
3732         right.insert(core::stringc(left), 0);
3733         return std::move(right);
3734 }
3735
3736
3737 //! Appends a ustring16 and an unsigned int.
3738 template <typename TAlloc>
3739 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned int right)
3740 {
3741         left.append(core::stringc(right));
3742         return std::move(left);
3743 }
3744
3745
3746 //! Appends a ustring16 and an unsigned int.
3747 template <typename TAlloc>
3748 inline ustring16<TAlloc> operator+(const unsigned int left, ustring16<TAlloc>&& right)
3749 {
3750         right.insert(core::stringc(left), 0);
3751         return std::move(right);
3752 }
3753
3754
3755 //! Appends a ustring16 and a long.
3756 template <typename TAlloc>
3757 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const long right)
3758 {
3759         left.append(core::stringc(right));
3760         return std::move(left);
3761 }
3762
3763
3764 //! Appends a ustring16 and a long.
3765 template <typename TAlloc>
3766 inline ustring16<TAlloc> operator+(const long left, ustring16<TAlloc>&& right)
3767 {
3768         right.insert(core::stringc(left), 0);
3769         return std::move(right);
3770 }
3771
3772
3773 //! Appends a ustring16 and an unsigned long.
3774 template <typename TAlloc>
3775 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const unsigned long right)
3776 {
3777         left.append(core::stringc(right));
3778         return std::move(left);
3779 }
3780
3781
3782 //! Appends a ustring16 and an unsigned long.
3783 template <typename TAlloc>
3784 inline ustring16<TAlloc> operator+(const unsigned long left, ustring16<TAlloc>&& right)
3785 {
3786         right.insert(core::stringc(left), 0);
3787         return std::move(right);
3788 }
3789
3790
3791 //! Appends a ustring16 and a float.
3792 template <typename TAlloc>
3793 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const float right)
3794 {
3795         left.append(core::stringc(right));
3796         return std::move(left);
3797 }
3798
3799
3800 //! Appends a ustring16 and a float.
3801 template <typename TAlloc>
3802 inline ustring16<TAlloc> operator+(const float left, ustring16<TAlloc>&& right)
3803 {
3804         right.insert(core::stringc(left), 0);
3805         return std::move(right);
3806 }
3807
3808
3809 //! Appends a ustring16 and a double.
3810 template <typename TAlloc>
3811 inline ustring16<TAlloc> operator+(ustring16<TAlloc>&& left, const double right)
3812 {
3813         left.append(core::stringc(right));
3814         return std::move(left);
3815 }
3816
3817
3818 //! Appends a ustring16 and a double.
3819 template <typename TAlloc>
3820 inline ustring16<TAlloc> operator+(const double left, ustring16<TAlloc>&& right)
3821 {
3822         right.insert(core::stringc(left), 0);
3823         return std::move(right);
3824 }
3825 #endif
3826
3827
3828 #ifndef USTRING_NO_STL
3829 //! Writes a ustring16 to an ostream.
3830 template <typename TAlloc>
3831 inline std::ostream& operator<<(std::ostream& out, const ustring16<TAlloc>& in)
3832 {
3833         out << in.toUTF8_s().c_str();
3834         return out;
3835 }
3836
3837 //! Writes a ustring16 to a wostream.
3838 template <typename TAlloc>
3839 inline std::wostream& operator<<(std::wostream& out, const ustring16<TAlloc>& in)
3840 {
3841         out << in.toWCHAR_s().c_str();
3842         return out;
3843 }
3844 #endif
3845
3846
3847 #ifndef USTRING_NO_STL
3848
3849 namespace unicode
3850 {
3851
3852 //! Hashing algorithm for hashing a ustring.  Used for things like unordered_maps.
3853 //! Algorithm taken from std::hash<std::string>.
3854 class hash : public std::unary_function<core::ustring, size_t>
3855 {
3856         public:
3857                 size_t operator()(const core::ustring& s) const
3858                 {
3859                         size_t ret = 2166136261U;
3860                         size_t index = 0;
3861                         size_t stride = 1 + s.size_raw() / 10;
3862
3863                         core::ustring::const_iterator i = s.begin();
3864                         while (i != s.end())
3865                         {
3866                                 // TODO: Don't force u32 on an x64 OS.  Make it agnostic.
3867                                 ret = 16777619U * ret ^ (size_t)s[(u32)index];
3868                                 index += stride;
3869                                 i += stride;
3870                         }
3871                         return (ret);
3872                 }
3873 };
3874
3875 } // end namespace unicode
3876
3877 #endif
3878
3879 } // end namespace core
3880 } // end namespace irr
3881
3882 #endif