505 lines
17 KiB
C++
505 lines
17 KiB
C++
/*
 * (c) Copyright Ascensio System SIA 2010-2023
 *
 * This program is a free software product. You can redistribute it and/or
 * modify it under the terms of the GNU Affero General Public License (AGPL)
 * version 3 as published by the Free Software Foundation. In accordance with
 * Section 7(a) of the GNU AGPL its Section 15 shall be amended to the effect
 * that Ascensio System SIA expressly excludes the warranty of non-infringement
 * of any third-party rights.
 *
 * This program is distributed WITHOUT ANY WARRANTY; without even the implied
 * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For
 * details, see the GNU AGPL at: http://www.gnu.org/licenses/agpl-3.0.html
 *
 * You can contact Ascensio System SIA at 20A-6 Ernesta Birznieka-Upish
 * street, Riga, Latvia, EU, LV-1050.
 *
 * The interactive user interfaces in modified source and object code versions
 * of the Program must display Appropriate Legal Notices, as required under
 * Section 5 of the GNU AGPL version 3.
 *
 * Pursuant to Section 7(b) of the License you must retain the original Product
 * logo when distributing the program. Pursuant to Section 7(e) we decline to
 * grant you any rights under trademark law for use of our trademarks.
 *
 * All the Product's GUI elements, including illustrations and icon sets, as
 * well as technical writing content are licensed under the terms of the
 * Creative Commons Attribution-ShareAlike 4.0 International. See the License
 * terms at http://creativecommons.org/licenses/by-sa/4.0/legalcode
 *
 */
|
|
#include "CPEncodings/CodePage.h"
|
|
#include "StringExt.h"
|
|
|
|
namespace NSStringExt
|
|
{
|
|
// Converts lCount single-byte characters from pData into a wchar_t buffer.
// The expansion writes through a variable named `pUnicode`, which must be in scope at the
// point of use and hold at least lCount elements.
// Bytes outside [MSCP_FIRST_CHAR, MSCP_LAST_CHAR] are copied through unchanged; bytes inside
// that range are looked up in UnicodeMapCP, which is indexed relative to MSCP_FIRST_CHAR.
// The arguments are parenthesised and the body is wrapped in do/while(0) so that the macro
// expands to exactly one statement (safe after an unbraced if/else, requires a trailing ';').
#define NSSTRING_COMMON_CP(UnicodeMapCP, lCount, pData) \
	do\
	{\
		for (long lCommonCpIndex = 0; lCommonCpIndex < (lCount); ++lCommonCpIndex)\
		{\
			unsigned char unChar = (unsigned char)(pData)[lCommonCpIndex];\
			if (unChar < MSCP_FIRST_CHAR || unChar > MSCP_LAST_CHAR)\
				pUnicode[lCommonCpIndex] = (wchar_t)unChar;\
			else\
				pUnicode[lCommonCpIndex] = (wchar_t)((UnicodeMapCP)[unChar - MSCP_FIRST_CHAR]);\
		}\
	} while (0)
// end define
|
|
|
|
void NSSTRING_WITHLEADBYTE_CP(wchar_t** ppUnicode, unsigned short LEAD_CHAR, const unsigned short* UnicodeMapCP, const TCodePagePair* UnicodeMapWithLeadByte, long lCount, const unsigned char* pData)
|
|
{
|
|
int nLeadByte = -1;
|
|
int nUnicodePos = 0;
|
|
for (long i = 0; i < lCount; ++i)
|
|
{
|
|
unsigned char unCode = (unsigned char)pData[i];
|
|
unsigned short ushUnicode = UnicodeMapCP[unCode];
|
|
if (-1 == nLeadByte)
|
|
{
|
|
if (LEAD_CHAR != ushUnicode)
|
|
{
|
|
(*ppUnicode)[nUnicodePos++] = ushUnicode;
|
|
nLeadByte = -1;
|
|
}
|
|
else
|
|
{
|
|
nLeadByte = unCode;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
unsigned short ushCode = (nLeadByte << 8) | unCode;
|
|
TCodePagePair *pPair = (TCodePagePair*)UnicodeMapWithLeadByte;
|
|
while (0xFFFF != pPair->ushCode)
|
|
{
|
|
if (ushCode == pPair->ushCode)
|
|
{
|
|
(*ppUnicode)[nUnicodePos++] = pPair->ushUnicode;
|
|
break;
|
|
}
|
|
pPair++;
|
|
}
|
|
nLeadByte = -1;
|
|
}
|
|
}
|
|
(*ppUnicode)[nUnicodePos++] = 0x0000;
|
|
}
|
|
|
|
std::wstring CConverter::GetUnicodeFromSingleByteString(const unsigned char* pData, long lCount, ESingleByteEncoding eType)
|
|
{
|
|
wchar_t* pUnicode = new wchar_t[lCount + 1];
|
|
if (!pUnicode)
|
|
return std::wstring(L"");
|
|
|
|
switch (eType)
|
|
{
|
|
default:
|
|
case SINGLE_BYTE_ENCODING_DEFAULT:
|
|
{
|
|
for (long i = 0; i < lCount; ++i)
|
|
pUnicode[i] = (wchar_t)(unsigned char)pData[i];
|
|
|
|
break;
|
|
}
|
|
case SINGLE_BYTE_ENCODING_SYMBOL:
|
|
{
|
|
// Добавляем 0xF000 к кодам всех символов
|
|
for (long i = 0; i < lCount; ++i)
|
|
{
|
|
pUnicode[i] = (wchar_t)(0xF000 | (unsigned char)pData[i]);
|
|
}
|
|
|
|
break;
|
|
}
|
|
case SINGLE_BYTE_ENCODING_CP866: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP866, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP874: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP874, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1250: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1250, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1251: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1251, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1252: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1252, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1253: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1253, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1254: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1254, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1255: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1255, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1256: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1256, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1257: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1257, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1258: NSSTRING_COMMON_CP(NSStringExt::c_anUnicodeMapCP1258, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP932: NSSTRING_WITHLEADBYTE_CP(&pUnicode, MSCP932_LEAD_CHAR, NSStringExt::c_anUnicodeMapCP932, c_aoUnicodeMapCP932WithLeadByte, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP936: NSSTRING_WITHLEADBYTE_CP(&pUnicode, MSCP936_LEAD_CHAR, NSStringExt::c_anUnicodeMapCP936, NSStringExt::c_aoUnicodeMapCP936WithLeadByte, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP949: NSSTRING_WITHLEADBYTE_CP(&pUnicode, MSCP949_LEAD_CHAR, NSStringExt::c_anUnicodeMapCP949, NSStringExt::c_aoUnicodeMapCP949WithLeadByte, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP950: NSSTRING_WITHLEADBYTE_CP(&pUnicode, MSCP950_LEAD_CHAR, NSStringExt::c_anUnicodeMapCP950, NSStringExt::c_aoUnicodeMapCP950WithLeadByte, lCount, pData); break;
|
|
case SINGLE_BYTE_ENCODING_CP1361:NSSTRING_WITHLEADBYTE_CP(&pUnicode, MSCP1361_LEAD_CHAR, NSStringExt::c_anUnicodeMapCP1361, NSStringExt::c_aoUnicodeMapCP1361WithLeadByte, lCount, pData); break;
|
|
}
|
|
|
|
pUnicode[lCount] = 0;
|
|
std::wstring s(pUnicode);
|
|
|
|
if (pUnicode)
|
|
delete[] pUnicode;
|
|
|
|
return s;
|
|
}
|
|
// Builds a wide string from lCount UTF-16 code units.
//
//   pData  - UTF-16 code units
//   lCount - number of code units in pData
//
// When wchar_t is 2 bytes wide the units are copied as they are. Otherwise surrogate pairs are
// combined into a single character:
//   - a unit outside 0xD800..0xDFFF is copied unchanged;
//   - a unit inside that range is combined with the next unit if that one is in 0xDC00..0xDFFF,
//     otherwise both units are skipped;
//   - a unit inside that range at the very end of the input ends the conversion.
// A null pData or a non-positive lCount yields an empty string.
//
// NOTE(review): the first unit of a pair is not checked to be below 0xDC00, so a trailing
// surrogate followed by another trailing surrogate is still combined - confirm whether that
// input can occur.
std::wstring CConverter::GetUnicodeFromUTF16(const unsigned short* pData, long lCount)
{
	if (NULL == pData || lCount <= 0)
		return L"";

	if (2 == sizeof(wchar_t))
		return std::wstring((const wchar_t*)pData, (size_t)lCount);

	// The result never has more characters than the input has code units. Accumulating in the
	// string itself removes the manual buffer, which used to leak when nothing was decoded.
	std::wstring sResult;
	sResult.reserve((size_t)lCount);

	long lCurPos = 0;
	while (lCurPos < lCount)
	{
		const unsigned int unLeading = pData[lCurPos++];
		if (unLeading < 0xD800 || unLeading > 0xDFFF)
		{
			sResult.push_back((wchar_t)unLeading);
			continue;
		}

		if (lCurPos >= lCount)
			break;

		const unsigned int unTrailing = pData[lCurPos++];
		if (unTrailing >= 0xDC00 && unTrailing <= 0xDFFF)
		{
			const unsigned int unPayload = ((unLeading & 0x03FF) << 10) | (unTrailing & 0x03FF);
			sResult.push_back((wchar_t)(0x10000 + unPayload));
		}
	}

	return sResult;
}
|
|
// Builds a wide string from lCount UTF-32 values.
//
//   pData  - UTF-32 values
//   lCount - number of values in pData
//
// When wchar_t is 4 bytes wide the values are copied as they are. Otherwise every value below
// 0x10000 becomes one character and every other value becomes a surrogate pair.
// A null pData or a non-positive lCount yields an empty string.
//
// NOTE(review): values above 0x10FFFF are not rejected; the pair computed for them is
// truncated to 16 bits per unit - confirm callers never pass such values.
std::wstring CConverter::GetUnicodeFromUTF32(const unsigned int* pData, long lCount)
{
	if (NULL == pData || lCount <= 0)
		return L"";

	if (4 == sizeof(wchar_t))
		return std::wstring((const wchar_t*)pData, (size_t)lCount);

	// Worst case is two units per input value. The string owns the storage, so there is no
	// manual buffer to release on any exit path.
	std::wstring sResult;
	sResult.reserve(2 * (size_t)lCount);

	for (long lIndex = 0; lIndex < lCount; ++lIndex)
	{
		unsigned int unUnicode = pData[lIndex];
		if (unUnicode < 0x10000)
		{
			sResult.push_back((wchar_t)unUnicode);
		}
		else
		{
			unUnicode -= 0x10000;
			sResult.push_back((wchar_t)(0xD800 | (unUnicode >> 10)));
			sResult.push_back((wchar_t)(0xDC00 | (unUnicode & 0x3FF)));
		}
	}

	return sResult;
}
|
|
// Encodes lCount UTF-32 values as UTF-8 into a newly allocated buffer.
//
//   pUnicodes    - input values
//   lCount       - number of input values
//   pOutputData  - receives a buffer allocated with new[]; it is always allocated, even for
//                  empty input, and has to be released with delete[]
//   lOutputCount - receives the number of bytes written, not counting the zero byte that is
//                  appended after them
//
// Sequences of up to six bytes are produced, so values up to 0x7FFFFFFF are encoded; larger
// values produce no output. A null pUnicodes or a negative lCount is treated as empty input.
void CConverter::GetUtf8FromUTF32(const unsigned int* pUnicodes, long lCount, unsigned char*& pOutputData, long& lOutputCount)
{
	// Clamp so that the allocation size below can never be computed from a negative count
	const long lSafeCount = (NULL != pUnicodes && lCount > 0) ? lCount : 0;

	pOutputData = new unsigned char[6 * (size_t)lSafeCount + 3 + 1];
	unsigned char* pCodesCur = pOutputData;

	for (long i = 0; i < lSafeCount; ++i)
	{
		const unsigned int code = pUnicodes[i];

		// Each upper bound is exclusive and equals the first value that needs one more byte.
		// The previous bounds (0x1FFFFF, 0x3FFFFFF, 0x7FFFFFFF) were one too small, which gave
		// 0x1FFFFF and 0x3FFFFFF an over-long form and dropped 0x7FFFFFFF.
		if (code < 0x80)
		{
			*pCodesCur++ = (unsigned char)code;
		}
		else if (code < 0x0800)
		{
			*pCodesCur++ = (unsigned char)(0xC0 | (code >> 6));
			*pCodesCur++ = (unsigned char)(0x80 | (code & 0x3F));
		}
		else if (code < 0x10000)
		{
			*pCodesCur++ = (unsigned char)(0xE0 | (code >> 12));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 6 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code & 0x3F));
		}
		else if (code < 0x200000)
		{
			*pCodesCur++ = (unsigned char)(0xF0 | (code >> 18));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 12 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 6 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code & 0x3F));
		}
		else if (code < 0x4000000)
		{
			*pCodesCur++ = (unsigned char)(0xF8 | (code >> 24));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 18 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 12 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 6 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code & 0x3F));
		}
		else if (code <= 0x7FFFFFFF)
		{
			*pCodesCur++ = (unsigned char)(0xFC | (code >> 30));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 24 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 18 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 12 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code >> 6 & 0x3F));
			*pCodesCur++ = (unsigned char)(0x80 | (code & 0x3F));
		}
		// Values above 0x7FFFFFFF do not fit into six bytes and are skipped
	}

	lOutputCount = (long)(pCodesCur - pOutputData);
	*pCodesCur = 0;
}
|
|
std::string CConverter::GetUtf8FromUTF32(const unsigned int* pUnicodes, long lCount)
|
|
{
|
|
unsigned char* pData;
|
|
long lOutputCount = 0;
|
|
|
|
GetUtf8FromUTF32(pUnicodes, lCount, pData, lOutputCount);
|
|
|
|
std::string s((char*)pData, lOutputCount);
|
|
delete [] pData;
|
|
return s;
|
|
}
|
|
// Converts a wide string into an array of UTF-32 values.
//
//   wsUnicodeText - source text
//   unLen         - receives the number of values written (0 when the text is empty)
//
// Returns NULL for empty text. Otherwise returns an array allocated with new[] that holds
// wsUnicodeText.size() elements, of which the first unLen are filled; nothing in this function
// releases it, so the caller is expected to delete[] it.
//
// When wchar_t is 2 bytes wide surrogate pairs are combined into one value. Malformed input is
// skipped: a trailing surrogate without a leading one, a leading surrogate together with a
// following unit that is not a trailing surrogate, and a leading surrogate at the end of text.
unsigned int* CConverter::GetUtf32FromUnicode(const std::wstring& wsUnicodeText, unsigned int& unLen)
{
	// Reset first so that the NULL return below never leaves a stale length behind
	unLen = 0;

	if (wsUnicodeText.empty())
		return NULL;

	unsigned int* pUnicodes = new unsigned int[wsUnicodeText.size()];

	if (2 == sizeof(wchar_t))
	{
		const wchar_t* wsInput = wsUnicodeText.c_str();
		const wchar_t* wsEnd = wsInput + wsUnicodeText.size();

		while (wsInput < wsEnd)
		{
			const wchar_t wLeading = *wsInput++;
			if (wLeading < 0xD800 || wLeading > 0xDFFF)
			{
				pUnicodes[unLen++] = (unsigned int)wLeading;
				continue;
			}

			if (wLeading >= 0xDC00)
			{
				// This should not happen: trailing surrogate without a leading one
				continue;
			}

			if (wsInput >= wsEnd)
			{
				// Leading surrogate is the last unit of the text: nothing to pair it with
				break;
			}

			const wchar_t wTrailing = *wsInput++;
			if (wTrailing < 0xDC00 || wTrailing > 0xDFFF)
			{
				// This should not happen: the unit after a leading surrogate is not a trailing one
				continue;
			}

			// 0x10000 has to be ADDED to the combined 20-bit payload. The previous expression
			// "unCode | (wTrailing & 0x3FF) + 0x10000" OR-ed it in because '+' binds tighter
			// than '|', which corrupted every value from U+20000 upwards.
			const unsigned int unHigh = (unsigned int)(wLeading & 0x3FF) << 10;
			const unsigned int unLow = (unsigned int)(wTrailing & 0x3FF);
			pUnicodes[unLen++] = 0x10000 + (unHigh | unLow);
		}
	}
	else
	{
		unLen = (unsigned int)wsUnicodeText.size();
		for (unsigned int unIndex = 0; unIndex < unLen; unIndex++)
		{
			pUnicodes[unIndex] = (unsigned int)wsUnicodeText[unIndex];
		}
	}

	return pUnicodes;
}
|
|
// Converts a wide string into an array of UTF-16 code units.
//
//   wsUnicodeText - source text
//   unLen         - receives the number of code units written (0 when the text is empty)
//   bIsLE         - when false, the two bytes of every produced unit are swapped in memory
//
// Returns NULL for empty text. Otherwise returns an array allocated with new[]; nothing in this
// function releases it, so the caller is expected to delete[] it.
//
// When wchar_t is 2 bytes wide the characters are copied one to one and the array holds exactly
// unLen elements. Otherwise every character of 0x10000 and above is written as a surrogate pair
// and the array is zero-filled beyond the written units.
//
// NOTE(review): characters above 0x10FFFF are not rejected; the pair computed for them is
// truncated to 16 bits per unit - confirm callers never pass such values.
unsigned short* CConverter::GetUtf16FromUnicode(const std::wstring& wsUnicodeText, unsigned int& unLen, const bool& bIsLE)
{
	// Reset first so that the NULL return below never leaves a stale length behind
	unLen = 0;

	const unsigned int unTextLen = (unsigned int)wsUnicodeText.size();
	if (0 == unTextLen)
		return NULL;

	unsigned short* pUtf16 = NULL;
	if (2 == sizeof(wchar_t))
	{
		pUtf16 = new unsigned short[unTextLen];

		unLen = unTextLen;
		for (unsigned int unIndex = 0; unIndex < unLen; unIndex++)
		{
			pUtf16[unIndex] = (unsigned short)wsUnicodeText[unIndex];
		}
	}
	else
	{
		// Worst case is two units per character, plus one spare element. The size is computed
		// in size_t so it cannot wrap in 32-bit arithmetic, and "()" zero-fills the array, which
		// replaces the former memset.
		const size_t nCapacity = 2 * (size_t)unTextLen + 1;
		pUtf16 = new unsigned short[nCapacity]();

		unsigned short* pCur = pUtf16;
		for (unsigned int unIndex = 0; unIndex < unTextLen; unIndex++)
		{
			unsigned int unUnicode = (unsigned int)wsUnicodeText[unIndex];
			if (unUnicode < 0x10000)
			{
				*pCur++ = (unsigned short)unUnicode;
			}
			else
			{
				unUnicode -= 0x10000;
				*pCur++ = (unsigned short)(0xD800 | (unUnicode >> 10));
				*pCur++ = (unsigned short)(0xDC00 | (unUnicode & 0x3FF));
			}
		}

		// Every character yields at least one unit, so the result is never empty here
		unLen = (unsigned int)(pCur - pUtf16);
	}

	if (!bIsLE)
	{
		// Exchanging the high and low byte of the value is the same as swapping the two bytes
		// of the unit in memory
		for (unsigned int unIndex = 0; unIndex < unLen; unIndex++)
		{
			const unsigned short ushUnit = pUtf16[unIndex];
			pUtf16[unIndex] = (unsigned short)((ushUnit << 8) | (ushUnit >> 8));
		}
	}

	return pUtf16;
}
|
|
|
|
// Private state of CStringUnicodeIterator: a view of the iterated text plus the current position.
class CStringUnicodeIterator_private
{
public:
	const wchar_t* m_str;   // points into the string passed to the constructor; not owned
	size_t m_str_len;       // length of that string in wchar_t units
	size_t m_index;         // current position, in wchar_t units

public:
	// Only the pointer returned by str.c_str() is stored, so str has to stay alive and
	// unmodified for as long as this object is used.
	CStringUnicodeIterator_private(const std::wstring& str)
		: m_str(str.c_str())
		, m_str_len(str.length())
		, m_index(0)
	{
	}

	// Returns true for any value in the surrogate range 0xD800..0xDFFF. Despite the name this
	// includes the trailing-surrogate half 0xDC00..0xDFFF.
	inline bool IsLeadingSurrogateChar(const unsigned int& nCharCode)
	{
		return !(nCharCode < 0xD800 || nCharCode > 0xDFFF);
	}

	// Combines a surrogate pair into one code point.
	// Returns 0 when nLeadingChar is 0xDC00 or above, or when nTrailingChar lies outside
	// 0xDC00..0xDFFF. The lower bound of nLeadingChar is not checked.
	inline unsigned int DecodeSurrogateChar(const unsigned int& nLeadingChar, const unsigned int& nTrailingChar)
	{
		const bool bLeadingOk = (nLeadingChar < 0xDC00);
		const bool bTrailingOk = (nTrailingChar >= 0xDC00 && nTrailingChar <= 0xDFFF);
		if (!bLeadingOk || !bTrailingOk)
			return 0;

		const unsigned int nHighPart = 0x10000 + ((nLeadingChar & 0x3FF) << 10);
		const unsigned int nLowPart = nTrailingChar & 0x3FF;
		return nHighPart | nLowPart;
	}
};
|
|
|
|
// Creates the private iteration state for the given text, positioned at its first unit.
// The state keeps only the pointer returned by string.c_str(), so the passed string has to
// outlive the iterator.
CStringUnicodeIterator::CStringUnicodeIterator(const std::wstring& string)
	: m_internal(new CStringUnicodeIterator_private(string))
{
}
|
|
// Destroys the private state object that the constructor allocated.
CStringUnicodeIterator::~CStringUnicodeIterator()
{
	delete this->m_internal;
}
|
|
|
|
bool CStringUnicodeIterator::Check()
|
|
{
|
|
return (m_internal->m_index < m_internal->m_str_len) ? true : false;
|
|
}
|
|
|
|
void CStringUnicodeIterator::Next()
|
|
{
|
|
if (this->m_internal->m_index >= this->m_internal->m_str_len)
|
|
return;
|
|
|
|
if (2 != sizeof(wchar_t))
|
|
{
|
|
m_internal->m_index++;
|
|
}
|
|
else
|
|
{
|
|
if (!m_internal->IsLeadingSurrogateChar((unsigned int)m_internal->m_str[m_internal->m_index]))
|
|
{
|
|
m_internal->m_index++;
|
|
return;
|
|
}
|
|
m_internal->m_index += 2;
|
|
}
|
|
}
|
|
|
|
unsigned int CStringUnicodeIterator::Value()
|
|
{
|
|
if (m_internal->m_index >= m_internal->m_str_len)
|
|
return 0;
|
|
|
|
if (2 != sizeof(wchar_t))
|
|
{
|
|
return (unsigned int)m_internal->m_str[m_internal->m_index];
|
|
}
|
|
else
|
|
{
|
|
unsigned int nCharCode = (unsigned int)m_internal->m_str[m_internal->m_index];
|
|
if (!m_internal->IsLeadingSurrogateChar(nCharCode))
|
|
return nCharCode;
|
|
|
|
if (m_internal->m_index == (m_internal->m_str_len - 1))
|
|
return nCharCode;
|
|
|
|
unsigned int nTrailingChar = (unsigned int)m_internal->m_str[m_internal->m_index + 1];
|
|
return m_internal->DecodeSurrogateChar(nCharCode, nTrailingChar);
|
|
}
|
|
}
|
|
}
|