/* * (c) Copyright Ascensio System SIA 2010-2023 * * This program is a free software product. You can redistribute it and/or * modify it under the terms of the GNU Affero General Public License (AGPL) * version 3 as published by the Free Software Foundation. In accordance with * Section 7(a) of the GNU AGPL its Section 15 shall be amended to the effect * that Ascensio System SIA expressly excludes the warranty of non-infringement * of any third-party rights. * * This program is distributed WITHOUT ANY WARRANTY; without even the implied * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For * details, see the GNU AGPL at: http://www.gnu.org/licenses/agpl-3.0.html * * You can contact Ascensio System SIA at 20A-6 Ernesta Birznieka-Upish * street, Riga, Latvia, EU, LV-1050. * * The interactive user interfaces in modified source and object code versions * of the Program must display Appropriate Legal Notices, as required under * Section 5 of the GNU AGPL version 3. * * Pursuant to Section 7(b) of the License you must retain the original Product * logo when distributing the program. Pursuant to Section 7(e) we decline to * grant you any rights under trademark law for use of our trademarks. * * All the Product's GUI elements, including illustrations and icon sets, as * well as technical writing content are licensed under the terms of the * Creative Commons Attribution-ShareAlike 4.0 International. See the License * terms at http://creativecommons.org/licenses/by-sa/4.0/legalcode * */ #include "CSVReader.h" #include "CellFormatController/CellFormatController.h" #include "../../../XlsbFormat/Xlsb.h" #include "../../../XlsxFormat/Worksheets/Worksheet.h" #include "../../../../DesktopEditor/common/File.h" #include #include #include "../../../../DesktopEditor/common/File.h" #include "../../../Base/unicode_util.h" #include "../../../../Common/OfficeFileErrorDescription.h" #include "../../../../UnicodeConverter/UnicodeConverter.h" #include "../../../../UnicodeConverter/UnicodeConverter_Encodings.h" #include "../../../XlsxFormat/Workbook/Workbook.h" #include "../../../XlsxFormat/SharedStrings/SharedStrings.h" #include "../../../XlsxFormat/Styles/Styles.h" #include "../../../XlsxFormat/Styles/NumFmts.h" #include "../../../XlsxFormat/Styles/Xfs.h" class CSVReader::Impl { public: Impl() {} _UINT32 Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& wcDelimiter, _INT32 lcid, bool readToCache); private: int AddCell(std::wstring &sText, INT nStartCell, std::stack &oDeleteChars, OOX::Spreadsheet::CRow &oRow, INT nRow, INT nCol, bool bIsWrap); std::shared_ptr cellFormatController_ = NULL; //--------------------------------------------------------------------------------------------------------- const std::wstring ansi_2_unicode(const unsigned char* data, DWORD data_size) { std::wstring result; std::locale loc(""); std::ctype const &facet = std::use_facet >(loc); result.resize(data_size); facet.widen((char*)data, (char*)data + data_size, &result[0]); return result; } void utf8_2_unicode(const unsigned char* data, DWORD data_size, std::wstring &wStr) { wStr.resize(data_size + 1); unsigned int nLength = data_size; ConversionResult eUnicodeConversionResult = conversionOK; if (sizeof(wchar_t) == 2)//utf8 -> utf16 { memset ((void *)wStr.data(), 0, sizeof (UTF16) * (nLength + 1)); UTF8 *pStrUtf8 = (UTF8 *) data; const UTF8 *pStrUtf8_Conv = pStrUtf8; UTF16 *pStrUtf16_Conv = (UTF16 *)wStr.data(); eUnicodeConversionResult = ConvertUTF8toUTF16 (&pStrUtf8_Conv, &pStrUtf8[nLength] , &pStrUtf16_Conv, &((UTF16 *)wStr.data())[nLength] , strictConversion); } else //utf8 -> utf32 { //UTF32 *pStrUtf32 = new UTF32 [nLength + 1]; //memset ((void *) pStrUtf32, 0, sizeof (UTF32) * (nLength + 1)); memset((void *)wStr.data(), 0, sizeof(UTF16) * (nLength + 1)); UTF8 *pStrUtf8 = (UTF8 *) data; const UTF8 *pStrUtf8_Conv = pStrUtf8; UTF32 *pStrUtf32_Conv = (UTF32 *)wStr.data(); eUnicodeConversionResult = ConvertUTF8toUTF32 (&pStrUtf8_Conv, &pStrUtf8[nLength] , &pStrUtf32_Conv, &((UTF32 *)wStr.data())[nLength] , strictConversion); } if (conversionOK != eUnicodeConversionResult) { wStr.clear(); std::string inp((char*)data, data_size); wStr = std::wstring(inp.begin(), inp.end()); } } const std::wstring utf16_2_unicode(const unsigned char* data, DWORD data_size) { if (sizeof(wchar_t) == 2)//utf16 -> utf16 { return std::wstring((wchar_t*)data, data_size / 2); } else //utf16 -> utf32 { unsigned int nLength = data_size / 2; UTF32 *pStrUtf32 = new UTF32 [nLength + 1]; memset ((void *) pStrUtf32, 0, sizeof (UTF32) * (nLength + 1)); UTF16 *pStrUtf16 = (UTF16 *) data; const UTF16 *pStrUtf16_Conv = pStrUtf16; UTF32 *pStrUtf32_Conv = pStrUtf32; ConversionResult eUnicodeConversionResult = ConvertUTF16toUTF32 (&pStrUtf16_Conv, &pStrUtf16[nLength] , &pStrUtf32_Conv, &pStrUtf32 [nLength] , strictConversion); if (conversionOK != eUnicodeConversionResult) { delete [] pStrUtf32; return std::wstring(); } std::wstring utf32Str ((wchar_t *) pStrUtf32); delete [] pStrUtf32; return utf32Str; } } const std::wstring utf32_2_unicode(const unsigned char* data, DWORD data_size) { if (sizeof(wchar_t) == 4)//utf32 -> utf32 { return std::wstring((wchar_t*)data, data_size / 4); } else //utf32 -> utf16 { unsigned int nLength = data_size / 4; UTF16 *pStrUtf16 = new UTF16 [nLength + 1]; memset ((void *) pStrUtf16, 0, sizeof (UTF16) * (nLength + 1)); UTF32 *pStrUtf32 = (UTF32 *) data; const UTF32 *pStrUtf32_Conv = pStrUtf32; UTF16 *pStrUtf16_Conv = pStrUtf16; ConversionResult eUnicodeConversionResult = ConvertUTF32toUTF16 (&pStrUtf32_Conv, &pStrUtf32[nLength] , &pStrUtf16_Conv, &pStrUtf16 [nLength] , strictConversion); if (conversionOK != eUnicodeConversionResult) { delete [] pStrUtf16; return std::wstring(); } std::wstring utf16Str ((wchar_t *) pStrUtf16); delete [] pStrUtf16; return utf16Str; } } }; //----------------------------------------------------------------------------------------------- int CSVReader::Impl::AddCell(std::wstring &sText, INT nStartCell, std::stack &oDeleteChars, OOX::Spreadsheet::CRow &oRow, INT nRow, INT nCol, bool bIsWrap) { int result = 0; while (!oDeleteChars.empty()) { INT nIndex = oDeleteChars.top() - nStartCell; sText.erase(nIndex, 1); oDeleteChars.pop(); } size_t length = sText.length(); // Пустую не пишем if ((0 == length) || (sText[0] == L'\0')) return result; OOX::Spreadsheet::CCell *pCell = new OOX::Spreadsheet::CCell(); pCell->m_oType.Init(); pCell->m_oCacheValue = sText; // как есть pCell->setRowCol(nRow, nCol); result = cellFormatController_->ProcessCellType(pCell, sText, bIsWrap); oRow.m_arrItems.push_back(pCell); return result; } _UINT32 CSVReader::Impl::Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& sDelimiter, _INT32 lcid, bool readToCache) { NSFile::CFileBinary oFile; if (false == oFile.OpenFile(sFileName)) return AVS_FILEUTILS_ERROR_CONVERT; //----------------------------------------------------------------------------------- // Создадим Workbook oXlsx.CreateWorkbook(); // Создадим стили oXlsx.CreateStyles(); cellFormatController_ = std::make_shared(oXlsx.m_pStyles, lcid); smart_ptr pWorksheet(new OOX::Spreadsheet::CWorksheet(NULL)); pWorksheet->m_oSheetData.Init(); pWorksheet->m_oSheetFormatPr.Init(); pWorksheet->m_oSheetFormatPr->m_oBaseColWidth = 9; cellFormatController_->m_pWorksheet = pWorksheet.GetPointer(); //----------------------------------------------------------------------------------- DWORD nFileSize = 0; BYTE* pFileData = new BYTE[oFile.GetFileSize() + 64]; oFile.ReadFile(pFileData, oFile.GetFileSize(), nFileSize); oFile.CloseFile(); //skip bom DWORD nInputBufferSize = nFileSize; BYTE* pInputBuffer = pFileData; if (nInputBufferSize >= 3 && 0xef == pInputBuffer[0] && 0xbb == pInputBuffer[1] && 0xbf == pInputBuffer[2]) { nInputBufferSize -= 3; pInputBuffer += 3; } else if (nInputBufferSize >= 2 && ((0xfe == pInputBuffer[0] && 0xff == pInputBuffer[1]) || (0xff == pInputBuffer[0] && 0xfe == pInputBuffer[1]))) { nInputBufferSize -= 2; pInputBuffer += 2; } std::wstring sFileDataW; if (nCodePage == 1000) { sFileDataW = ansi_2_unicode(pInputBuffer, nInputBufferSize); } else if (nCodePage == 46)//utf-8 { utf8_2_unicode(pInputBuffer, nInputBufferSize, sFileDataW); } else if (nCodePage == 48)//utf-16 { sFileDataW = utf16_2_unicode(pInputBuffer, nInputBufferSize); } else if (nCodePage == 50) // utf-32 { sFileDataW = utf32_2_unicode(pInputBuffer, nInputBufferSize); } else { const NSUnicodeConverter::EncodindId& oEncodindId = NSUnicodeConverter::Encodings[nCodePage]; NSUnicodeConverter::CUnicodeConverter oUnicodeConverter; sFileDataW = oUnicodeConverter.toUnicode((const char*)pInputBuffer, nInputBufferSize, oEncodindId.Name); } //------------------------------------------------------------------------------------------------------------------------------ delete[]pFileData; size_t nSize = sFileDataW.length(); if (nSize < 1 && nInputBufferSize > 0) {//для синхронности вывода превью и нормального результата const NSUnicodeConverter::EncodindId& oEncodindId = NSUnicodeConverter::Encodings[nCodePage]; NSUnicodeConverter::CUnicodeConverter oUnicodeConverter; sFileDataW = oUnicodeConverter.toUnicode((const char*)pInputBuffer, nInputBufferSize, oEncodindId.Name); nSize = sFileDataW.length(); //return AVS_FILEUTILS_ERROR_CONVERT_ICU; } WCHAR wcDelimiterLeading = L'\0'; WCHAR wcDelimiterTrailing = L'\0'; int nDelimiterSize = 0; if(sFileDataW.size() > 7 && sFileDataW.substr(0, 4) == L"sep=") { wcDelimiterLeading = sFileDataW[4]; nDelimiterSize = 1; if (2 == sizeof(wchar_t) && 0xD800 <= wcDelimiterLeading && wcDelimiterLeading <= 0xDBFF &&( sFileDataW[5] != L'\r' || sFileDataW[5] != L'\n')) { wcDelimiterTrailing = sFileDataW[5]; nDelimiterSize = 2; } auto newPos = 4 + nDelimiterSize; if(sFileDataW[newPos] == L'\r' || sFileDataW[newPos] == L'\n') { newPos++; if(sFileDataW[newPos] == L'\r' || sFileDataW[newPos] == L'\n') newPos++; } sFileDataW.erase(0, newPos); nSize = sFileDataW.length(); } else if (sDelimiter.length() > 0) { wcDelimiterLeading = sDelimiter[0]; nDelimiterSize = 1; if (2 == sizeof(wchar_t) && 0xD800 <= wcDelimiterLeading && wcDelimiterLeading <= 0xDBFF && sDelimiter.length() > 1) { wcDelimiterTrailing = sDelimiter[1]; nDelimiterSize = 2; } } const WCHAR wcNewLineN = _T('\n'); const WCHAR wcNewLineR = _T('\r'); const WCHAR wcQuote = _T('"'); const WCHAR wcTab = _T('\t'); bool bIsWrap = false; WCHAR wcCurrent; INT nStartCell = 0; std::stack oDeleteChars; bool bMsLimit = false; bool bMsLimitCell = false; bool bInQuote = false; INT nIndexRow = 0; INT nIndexCol = 0; OOX::Spreadsheet::CRow *pRow = new OOX::Spreadsheet::CRow(); pRow->m_oR.Init(); pRow->m_oR->SetValue(nIndexRow + 1); const WCHAR *pTemp = sFileDataW.c_str(); for (size_t nIndex = 0; nIndex < nSize; ++nIndex) { wcCurrent = pTemp[nIndex]; if (wcDelimiterLeading == wcCurrent && (L'\0' == wcDelimiterTrailing || (nIndex + 1 < nSize && wcDelimiterTrailing == pTemp[nIndex + 1]))) { if (bInQuote) continue; // New Cell std::wstring sCellText(pTemp + nStartCell, nIndex - nStartCell); if (1 == AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap)) { bMsLimitCell = true; } oDeleteChars = std::stack(); bIsWrap = false; if (nIndex + nDelimiterSize == nSize) { if(readToCache) { pRow->storeXmlCache(); pWorksheet->m_oSheetData->AddRowToCache(*pRow); delete pRow; pRow = NULL; } else { pWorksheet->m_oSheetData->m_arrItems.push_back(pRow); pRow = NULL; } } if (nIndex + nDelimiterSize > 500000) { nStartCell = 0; sFileDataW.erase(0, nIndex + nDelimiterSize); nSize -= (nIndex + nDelimiterSize); nIndex = 0; pTemp = sFileDataW.c_str(); } else nStartCell = nIndex + nDelimiterSize; } else if (wcNewLineN == wcCurrent || wcNewLineR == wcCurrent) { if (bInQuote) { // Добавим Wrap bIsWrap = true; continue; } // New line if (nStartCell != nIndex) { std::wstring sCellText(pTemp + nStartCell, nIndex - nStartCell); if (1 == AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap)) { bMsLimitCell = true; } bIsWrap = false; } if (wcNewLineR == wcCurrent && nIndex + 1 != nSize && wcNewLineN == pTemp[nIndex + 1]) { // На комбинацию \r\n должен быть только 1 перенос ++nIndex; } if (nIndex + 1 > 500000) { nStartCell = 0; sFileDataW.erase(0, nIndex + 1); nSize -= (nIndex + 1); nIndex = 0; pTemp = sFileDataW.c_str(); } else nStartCell = nIndex + 1; if(readToCache) { pRow->storeXmlCache(); pWorksheet->m_oSheetData->AddRowToCache(*pRow); delete pRow; pRow = NULL; } else pWorksheet->m_oSheetData->m_arrItems.push_back(pRow); pRow = new OOX::Spreadsheet::CRow(); pRow->m_oR.Init(); pRow->m_oR->SetValue(++nIndexRow + 1); nIndexCol = 0; if (pWorksheet->m_oSheetData->m_arrItems.size() > 1048576) { bMsLimit = true; break; // ограниечние мс } } else if (wcQuote == wcCurrent) { // Quote if (false == bInQuote && nStartCell == nIndex && nIndex + 1 != nSize) { // Начало новой ячейки (только если мы сразу после разделителя и не в конце файла) bInQuote = !bInQuote; nStartCell = nIndex + 1; } else if (bInQuote) { // Нужно удалить кавычку ограничитель oDeleteChars.push(nIndex); // Если следующий символ кавычка, то мы не закончили ограничитель строки (1997,Ford,E350,"Super, ""luxurious"" truck") if (nIndex + 1 != nSize && wcQuote == pTemp[nIndex + 1]) ++nIndex; else bInQuote = !bInQuote; } } else if (wcTab == wcCurrent) { // delete tab if not delimiter oDeleteChars.push(nIndex); } } if (nStartCell != nSize && !bMsLimit) { while (nSize > 0) { if (pTemp[nSize - 1] != L'\0') break; else nSize--; } std::wstring sCellText(pTemp + nStartCell, nSize - nStartCell); if (1 == AddCell(sCellText, nStartCell, oDeleteChars, *pRow, nIndexRow, nIndexCol++, bIsWrap)) { bMsLimitCell = true; } if(readToCache) { pRow->storeXmlCache(); pWorksheet->m_oSheetData->AddRowToCache(*pRow); delete pRow; pRow = NULL; } else pWorksheet->m_oSheetData->m_arrItems.push_back(pRow); } else { RELEASEOBJECT(pRow); } oXlsx.m_arWorksheets.push_back(pWorksheet.GetPointer()); smart_ptr oWorksheetFile = pWorksheet.smart_dynamic_cast(); const OOX::RId oRid = oXlsx.m_pWorkbook->Add(oWorksheetFile); oXlsx.m_mapWorksheets.insert(std::make_pair(oRid.ToString(), pWorksheet.GetPointer())); // for bin OOX::Spreadsheet::CSheet *pSheet = new OOX::Spreadsheet::CSheet(); pSheet->m_oName = L"Sheet1"; pSheet->m_oSheetId.Init(); pSheet->m_oSheetId->SetValue(1); pSheet->m_oRid.Init(); pSheet->m_oRid->SetValue(oRid.ToString()); oXlsx.m_pWorkbook->m_oSheets.Init(); oXlsx.m_pWorkbook->m_oSheets->m_arrItems.push_back(pSheet); return bMsLimit ? AVS_FILEUTILS_ERROR_CONVERT_ROWLIMITS : (bMsLimitCell ? AVS_FILEUTILS_ERROR_CONVERT_CELLLIMITS : 0); } //---------------------------------------------------------------------------------- CSVReader::CSVReader() : impl_(new CSVReader::Impl()) { } CSVReader::~CSVReader() { } _UINT32 CSVReader::Read(const std::wstring &sFileName, OOX::Spreadsheet::CXlsx &oXlsx, _UINT32 nCodePage, const std::wstring& sDelimiter, _INT32 lcid) { return impl_->Read(sFileName, oXlsx, nCodePage, sDelimiter, lcid, readToxmlCache_); }