Files
DocumentServer-v-9.2.0/core/OOXML/Binary/Sheets/Reader/CSVReader.cpp
Yajbir Singh f1b860b25c
Some checks failed
check / markdownlint (push) Has been cancelled
check / spellchecker (push) Has been cancelled
updated
2025-12-11 19:03:17 +05:30

531 lines
16 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* (c) Copyright Ascensio System SIA 2010-2023
*
* This program is a free software product. You can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License (AGPL)
* version 3 as published by the Free Software Foundation. In accordance with
* Section 7(a) of the GNU AGPL its Section 15 shall be amended to the effect
* that Ascensio System SIA expressly excludes the warranty of non-infringement
* of any third-party rights.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For
* details, see the GNU AGPL at: http://www.gnu.org/licenses/agpl-3.0.html
*
* You can contact Ascensio System SIA at 20A-6 Ernesta Birznieka-Upish
* street, Riga, Latvia, EU, LV-1050.
*
* The interactive user interfaces in modified source and object code versions
* of the Program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU AGPL version 3.
*
* Pursuant to Section 7(b) of the License you must retain the original Product
* logo when distributing the program. Pursuant to Section 7(e) we decline to
* grant you any rights under trademark law for use of our trademarks.
*
* All the Product's GUI elements, including illustrations and icon sets, as
* well as technical writing content are licensed under the terms of the
* Creative Commons Attribution-ShareAlike 4.0 International. See the License
* terms at http://creativecommons.org/licenses/by-sa/4.0/legalcode
*
*/
#include "CSVReader.h"
#include "CellFormatController/CellFormatController.h"
#include "../../../XlsbFormat/Xlsb.h"
#include "../../../XlsxFormat/Worksheets/Worksheet.h"
#include "../../../../DesktopEditor/common/File.h"
#include <map>
#include <locale>
#include "../../../../DesktopEditor/common/File.h"
#include "../../../Base/unicode_util.h"
#include "../../../../Common/OfficeFileErrorDescription.h"
#include "../../../../UnicodeConverter/UnicodeConverter.h"
#include "../../../../UnicodeConverter/UnicodeConverter_Encodings.h"
#include "../../../XlsxFormat/Workbook/Workbook.h"
#include "../../../XlsxFormat/SharedStrings/SharedStrings.h"
#include "../../../XlsxFormat/Styles/Styles.h"
#include "../../../XlsxFormat/Styles/NumFmts.h"
#include "../../../XlsxFormat/Styles/Xfs.h"
class CSVReader::Impl
{
public:
Impl() {}
_UINT32 Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& wcDelimiter, _INT32 lcid, bool readToCache);
private:
int AddCell(std::wstring &sText, INT nStartCell, std::stack<INT> &oDeleteChars, OOX::Spreadsheet::CRow &oRow, INT nRow, INT nCol, bool bIsWrap);
std::shared_ptr<CellFormatController> cellFormatController_ = NULL;
//---------------------------------------------------------------------------------------------------------
const std::wstring ansi_2_unicode(const unsigned char* data, DWORD data_size)
{
std::wstring result;
std::locale loc("");
std::ctype<wchar_t> const &facet = std::use_facet<std::ctype<wchar_t> >(loc);
result.resize(data_size);
facet.widen((char*)data, (char*)data + data_size, &result[0]);
return result;
}
void utf8_2_unicode(const unsigned char* data, DWORD data_size, std::wstring &wStr)
{
wStr.resize(data_size + 1);
unsigned int nLength = data_size;
ConversionResult eUnicodeConversionResult = conversionOK;
if (sizeof(wchar_t) == 2)//utf8 -> utf16
{
memset ((void *)wStr.data(), 0, sizeof (UTF16) * (nLength + 1));
UTF8 *pStrUtf8 = (UTF8 *) data;
const UTF8 *pStrUtf8_Conv = pStrUtf8;
UTF16 *pStrUtf16_Conv = (UTF16 *)wStr.data();
eUnicodeConversionResult = ConvertUTF8toUTF16 (&pStrUtf8_Conv, &pStrUtf8[nLength]
, &pStrUtf16_Conv, &((UTF16 *)wStr.data())[nLength]
, strictConversion);
}
else //utf8 -> utf32
{
//UTF32 *pStrUtf32 = new UTF32 [nLength + 1];
//memset ((void *) pStrUtf32, 0, sizeof (UTF32) * (nLength + 1));
memset((void *)wStr.data(), 0, sizeof(UTF16) * (nLength + 1));
UTF8 *pStrUtf8 = (UTF8 *) data;
const UTF8 *pStrUtf8_Conv = pStrUtf8;
UTF32 *pStrUtf32_Conv = (UTF32 *)wStr.data();
eUnicodeConversionResult = ConvertUTF8toUTF32 (&pStrUtf8_Conv, &pStrUtf8[nLength]
, &pStrUtf32_Conv, &((UTF32 *)wStr.data())[nLength]
, strictConversion);
}
if (conversionOK != eUnicodeConversionResult)
{
wStr.clear();
std::string inp((char*)data, data_size);
wStr = std::wstring(inp.begin(), inp.end());
}
}
const std::wstring utf16_2_unicode(const unsigned char* data, DWORD data_size)
{
if (sizeof(wchar_t) == 2)//utf16 -> utf16
{
return std::wstring((wchar_t*)data, data_size / 2);
}
else //utf16 -> utf32
{
unsigned int nLength = data_size / 2;
UTF32 *pStrUtf32 = new UTF32 [nLength + 1];
memset ((void *) pStrUtf32, 0, sizeof (UTF32) * (nLength + 1));
UTF16 *pStrUtf16 = (UTF16 *) data;
const UTF16 *pStrUtf16_Conv = pStrUtf16;
UTF32 *pStrUtf32_Conv = pStrUtf32;
ConversionResult eUnicodeConversionResult = ConvertUTF16toUTF32 (&pStrUtf16_Conv, &pStrUtf16[nLength]
, &pStrUtf32_Conv, &pStrUtf32 [nLength]
, strictConversion);
if (conversionOK != eUnicodeConversionResult)
{
delete [] pStrUtf32;
return std::wstring();
}
std::wstring utf32Str ((wchar_t *) pStrUtf32);
delete [] pStrUtf32;
return utf32Str;
}
}
const std::wstring utf32_2_unicode(const unsigned char* data, DWORD data_size)
{
if (sizeof(wchar_t) == 4)//utf32 -> utf32
{
return std::wstring((wchar_t*)data, data_size / 4);
}
else //utf32 -> utf16
{
unsigned int nLength = data_size / 4;
UTF16 *pStrUtf16 = new UTF16 [nLength + 1];
memset ((void *) pStrUtf16, 0, sizeof (UTF16) * (nLength + 1));
UTF32 *pStrUtf32 = (UTF32 *) data;
const UTF32 *pStrUtf32_Conv = pStrUtf32;
UTF16 *pStrUtf16_Conv = pStrUtf16;
ConversionResult eUnicodeConversionResult = ConvertUTF32toUTF16 (&pStrUtf32_Conv, &pStrUtf32[nLength]
, &pStrUtf16_Conv, &pStrUtf16 [nLength]
, strictConversion);
if (conversionOK != eUnicodeConversionResult)
{
delete [] pStrUtf16;
return std::wstring();
}
std::wstring utf16Str ((wchar_t *) pStrUtf16);
delete [] pStrUtf16;
return utf16Str;
}
}
};
//-----------------------------------------------------------------------------------------------
int CSVReader::Impl::AddCell(std::wstring &sText, INT nStartCell, std::stack<INT> &oDeleteChars, OOX::Spreadsheet::CRow &oRow, INT nRow, INT nCol, bool bIsWrap)
{
int result = 0;
while (!oDeleteChars.empty())
{
INT nIndex = oDeleteChars.top() - nStartCell;
sText.erase(nIndex, 1);
oDeleteChars.pop();
}
size_t length = sText.length();
// Пустую не пишем
if ((0 == length) || (sText[0] == L'\0'))
return result;
OOX::Spreadsheet::CCell *pCell = new OOX::Spreadsheet::CCell();
pCell->m_oType.Init();
pCell->m_oCacheValue = sText; // как есть
pCell->setRowCol(nRow, nCol);
result = cellFormatController_->ProcessCellType(pCell, sText, bIsWrap);
oRow.m_arrItems.push_back(pCell);
return result;
}
_UINT32 CSVReader::Impl::Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& sDelimiter, _INT32 lcid, bool readToCache)
{
NSFile::CFileBinary oFile;
if (false == oFile.OpenFile(sFileName)) return AVS_FILEUTILS_ERROR_CONVERT;
//-----------------------------------------------------------------------------------
// Создадим Workbook
oXlsx.CreateWorkbook();
// Создадим стили
oXlsx.CreateStyles();
cellFormatController_ = std::make_shared<CellFormatController>(oXlsx.m_pStyles, lcid);
smart_ptr<OOX::Spreadsheet::CWorksheet> pWorksheet(new OOX::Spreadsheet::CWorksheet(NULL));
pWorksheet->m_oSheetData.Init();
pWorksheet->m_oSheetFormatPr.Init();
pWorksheet->m_oSheetFormatPr->m_oBaseColWidth = 9;
cellFormatController_->m_pWorksheet = pWorksheet.GetPointer();
//-----------------------------------------------------------------------------------
DWORD nFileSize = 0;
BYTE* pFileData = new BYTE[oFile.GetFileSize() + 64];
oFile.ReadFile(pFileData, oFile.GetFileSize(), nFileSize);
oFile.CloseFile();
//skip bom
DWORD nInputBufferSize = nFileSize;
BYTE* pInputBuffer = pFileData;
if (nInputBufferSize >= 3 && 0xef == pInputBuffer[0] && 0xbb == pInputBuffer[1] && 0xbf == pInputBuffer[2])
{
nInputBufferSize -= 3;
pInputBuffer += 3;
}
else if (nInputBufferSize >= 2 && ((0xfe == pInputBuffer[0] && 0xff == pInputBuffer[1]) || (0xff == pInputBuffer[0] && 0xfe == pInputBuffer[1])))
{
nInputBufferSize -= 2;
pInputBuffer += 2;
}
std::wstring sFileDataW;
if (nCodePage == 1000)
{
sFileDataW = ansi_2_unicode(pInputBuffer, nInputBufferSize);
}
else if (nCodePage == 46)//utf-8
{
utf8_2_unicode(pInputBuffer, nInputBufferSize, sFileDataW);
}
else if (nCodePage == 48)//utf-16
{
sFileDataW = utf16_2_unicode(pInputBuffer, nInputBufferSize);
}
else if (nCodePage == 50) // utf-32
{
sFileDataW = utf32_2_unicode(pInputBuffer, nInputBufferSize);
}
else
{
const NSUnicodeConverter::EncodindId& oEncodindId = NSUnicodeConverter::Encodings[nCodePage];
NSUnicodeConverter::CUnicodeConverter oUnicodeConverter;
sFileDataW = oUnicodeConverter.toUnicode((const char*)pInputBuffer, nInputBufferSize, oEncodindId.Name);
}
//------------------------------------------------------------------------------------------------------------------------------
delete[]pFileData;
size_t nSize = sFileDataW.length();
if (nSize < 1 && nInputBufferSize > 0)
{//для синхронности вывода превью и нормального результата
const NSUnicodeConverter::EncodindId& oEncodindId = NSUnicodeConverter::Encodings[nCodePage];
NSUnicodeConverter::CUnicodeConverter oUnicodeConverter;
sFileDataW = oUnicodeConverter.toUnicode((const char*)pInputBuffer, nInputBufferSize, oEncodindId.Name);
nSize = sFileDataW.length();
//return AVS_FILEUTILS_ERROR_CONVERT_ICU;
}
WCHAR wcDelimiterLeading = L'\0';
WCHAR wcDelimiterTrailing = L'\0';
int nDelimiterSize = 0;
if(sFileDataW.size() > 7 && sFileDataW.substr(0, 4) == L"sep=")
{
wcDelimiterLeading = sFileDataW[4];
nDelimiterSize = 1;
if (2 == sizeof(wchar_t) && 0xD800 <= wcDelimiterLeading && wcDelimiterLeading <= 0xDBFF &&( sFileDataW[5] != L'\r' || sFileDataW[5] != L'\n'))
{
wcDelimiterTrailing = sFileDataW[5];
nDelimiterSize = 2;
}
auto newPos = 4 + nDelimiterSize;
if(sFileDataW[newPos] == L'\r' || sFileDataW[newPos] == L'\n')
{
newPos++;
if(sFileDataW[newPos] == L'\r' || sFileDataW[newPos] == L'\n')
newPos++;
}
sFileDataW.erase(0, newPos);
nSize = sFileDataW.length();
}
else if (sDelimiter.length() > 0)
{
wcDelimiterLeading = sDelimiter[0];
nDelimiterSize = 1;
if (2 == sizeof(wchar_t) && 0xD800 <= wcDelimiterLeading && wcDelimiterLeading <= 0xDBFF && sDelimiter.length() > 1)
{
wcDelimiterTrailing = sDelimiter[1];
nDelimiterSize = 2;
}
}
const WCHAR wcNewLineN = _T('\n');
const WCHAR wcNewLineR = _T('\r');
const WCHAR wcQuote = _T('"');
const WCHAR wcTab = _T('\t');
bool bIsWrap = false;
WCHAR wcCurrent;
INT nStartCell = 0;
std::stack<INT> oDeleteChars;
bool bMsLimit = false;
bool bMsLimitCell = false;
bool bInQuote = false;
INT nIndexRow = 0;
INT nIndexCol = 0;
OOX::Spreadsheet::CRow *pRow = new OOX::Spreadsheet::CRow();
pRow->m_oR.Init();
pRow->m_oR->SetValue(nIndexRow + 1);
const WCHAR *pTemp = sFileDataW.c_str();
for (size_t nIndex = 0; nIndex < nSize; ++nIndex)
{
wcCurrent = pTemp[nIndex];
if (wcDelimiterLeading == wcCurrent && (L'\0' == wcDelimiterTrailing || (nIndex + 1 < nSize && wcDelimiterTrailing == pTemp[nIndex + 1])))
{
if (bInQuote)
continue;
// New Cell
std::wstring sCellText(pTemp + nStartCell, nIndex - nStartCell);
if (1 == AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap))
{
bMsLimitCell = true;
}
oDeleteChars = std::stack<INT>();
bIsWrap = false;
if (nIndex + nDelimiterSize == nSize)
{ if(readToCache)
{
pRow->storeXmlCache();
pWorksheet->m_oSheetData->AddRowToCache(*pRow);
delete pRow;
pRow = NULL;
}
else
{
pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
pRow = NULL;
}
}
if (nIndex + nDelimiterSize > 500000)
{
nStartCell = 0;
sFileDataW.erase(0, nIndex + nDelimiterSize);
nSize -= (nIndex + nDelimiterSize); nIndex = 0;
pTemp = sFileDataW.c_str();
}
else
nStartCell = nIndex + nDelimiterSize;
}
else if (wcNewLineN == wcCurrent || wcNewLineR == wcCurrent)
{
if (bInQuote)
{
// Добавим Wrap
bIsWrap = true;
continue;
}
// New line
if (nStartCell != nIndex)
{
std::wstring sCellText(pTemp + nStartCell, nIndex - nStartCell);
if (1 == AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap))
{
bMsLimitCell = true;
}
bIsWrap = false;
}
if (wcNewLineR == wcCurrent && nIndex + 1 != nSize && wcNewLineN == pTemp[nIndex + 1])
{
// На комбинацию \r\n должен быть только 1 перенос
++nIndex;
}
if (nIndex + 1 > 500000)
{
nStartCell = 0;
sFileDataW.erase(0, nIndex + 1);
nSize -= (nIndex + 1); nIndex = 0;
pTemp = sFileDataW.c_str();
}
else
nStartCell = nIndex + 1;
if(readToCache)
{
pRow->storeXmlCache();
pWorksheet->m_oSheetData->AddRowToCache(*pRow);
delete pRow;
pRow = NULL;
}
else
pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
pRow = new OOX::Spreadsheet::CRow();
pRow->m_oR.Init();
pRow->m_oR->SetValue(++nIndexRow + 1);
nIndexCol = 0;
if (pWorksheet->m_oSheetData->m_arrItems.size() > 1048576)
{
bMsLimit = true;
break; // ограниечние мс
}
}
else if (wcQuote == wcCurrent)
{
// Quote
if (false == bInQuote && nStartCell == nIndex && nIndex + 1 != nSize)
{
// Начало новой ячейки (только если мы сразу после разделителя и не в конце файла)
bInQuote = !bInQuote;
nStartCell = nIndex + 1;
}
else if (bInQuote)
{
// Нужно удалить кавычку ограничитель
oDeleteChars.push(nIndex);
// Если следующий символ кавычка, то мы не закончили ограничитель строки (1997,Ford,E350,"Super, ""luxurious"" truck")
if (nIndex + 1 != nSize && wcQuote == pTemp[nIndex + 1])
++nIndex;
else
bInQuote = !bInQuote;
}
}
else if (wcTab == wcCurrent)
{
// delete tab if not delimiter
oDeleteChars.push(nIndex);
}
}
if (nStartCell != nSize && !bMsLimit)
{
while (nSize > 0)
{
if (pTemp[nSize - 1] != L'\0')
break;
else nSize--;
}
std::wstring sCellText(pTemp + nStartCell, nSize - nStartCell);
if (1 == AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap))
{
bMsLimitCell = true;
}
if(readToCache)
{
pRow->storeXmlCache();
pWorksheet->m_oSheetData->AddRowToCache(*pRow);
delete pRow;
pRow = NULL;
}
else
pWorksheet->m_oSheetData->m_arrItems.push_back(pRow);
}
else
{
RELEASEOBJECT(pRow);
}
oXlsx.m_arWorksheets.push_back(pWorksheet.GetPointer());
smart_ptr<OOX::File> oWorksheetFile = pWorksheet.smart_dynamic_cast<OOX::File>();
const OOX::RId oRid = oXlsx.m_pWorkbook->Add(oWorksheetFile);
oXlsx.m_mapWorksheets.insert(std::make_pair(oRid.ToString(), pWorksheet.GetPointer())); // for bin
OOX::Spreadsheet::CSheet *pSheet = new OOX::Spreadsheet::CSheet();
pSheet->m_oName = L"Sheet1";
pSheet->m_oSheetId.Init();
pSheet->m_oSheetId->SetValue(1);
pSheet->m_oRid.Init();
pSheet->m_oRid->SetValue(oRid.ToString());
oXlsx.m_pWorkbook->m_oSheets.Init();
oXlsx.m_pWorkbook->m_oSheets->m_arrItems.push_back(pSheet);
return bMsLimit ? AVS_FILEUTILS_ERROR_CONVERT_ROWLIMITS : (bMsLimitCell ? AVS_FILEUTILS_ERROR_CONVERT_CELLLIMITS : 0);
}
//----------------------------------------------------------------------------------
CSVReader::CSVReader() : impl_(new CSVReader::Impl())
{
}
CSVReader::~CSVReader()
{
}
_UINT32 CSVReader::Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& sDelimiter, _INT32 lcid)
{
return impl_->Read(sFileName, oXlsx, nCodePage, sDelimiter, lcid, readToxmlCache_);
}