#ifndef HTMLTOXHTML_H #define HTMLTOXHTML_H #include #include #include #include #include #include "gumbo-parser/src/gumbo.h" #include "../../../DesktopEditor/common/File.h" #include "../../../DesktopEditor/common/Directory.h" #include "../../../DesktopEditor/common/StringBuilder.h" #include "../../../DesktopEditor/xml/include/xmlutils.h" #include "../../../UnicodeConverter/UnicodeConverter.h" #include "../../../HtmlFile2/src/StringFinder.h" #if defined(CreateDirectory) #undef CreateDirectory #endif static std::string nonbreaking_inline = "|a|abbr|acronym|b|bdo|big|cite|code|dfn|em|font|i|img|kbd|nobr|s|small|span|strike|strong|sub|sup|tt|"; static std::string empty_tags = "|area|base|basefont|bgsound|br|command|col|embed|event-source|frame|hr|image|img|input|keygen|link|menuitem|meta|param|source|spacer|track|wbr|"; static std::string preserve_whitespace = "|pre|textarea|script|style|"; static std::string special_handling = "|html|body|"; static std::string treat_like_inline = "|p|"; static std::vector html_tags = {"div","span","a","img","p","h1","h2","h3","h4","h5","h6", "ul", "ol", "li","td","tr","table","thead","tbody","tfoot","th", "br","form","input","button","section","nav","header","footer", "main","figure","figcaption","strong","em","i", "b", "u","pre", "code","blockquote","hr","script","link","meta","style","title", "head","body","html","legend","optgroup","option","select","dl", "dt","dd","time","data","abbr","address","area","base","bdi", "bdo","cite","col","iframe","video","source","track","textarea", "label","fieldset","colgroup","del","ins","details","summary", "dialog","embed","kbd","map","mark","menu","meter","object", "output","param","progress","q","samp","small","sub","sup","var", "wbr","acronym","applet","article","aside","audio","basefont", "bgsound","big","blink","canvas","caption","center","command", "comment","datalist","dfn","dir","font","frame","frameset", "hgroup","isindex","keygen","marquee","nobr","noembed","noframes", "noscript","plaintext","rp","rt","ruby","s","strike","tt","xmp"}; static std::vector unchecked_nodes_new = {"svg"}; static void prettyprint(GumboNode*, NSStringUtils::CStringBuilderA& oBuilder, bool bCheckValidNode = true); static std::string mhtTohtml(const std::string &sFileContent); // Заменяет в строке s все символы s1 на s2 static void replace_all(std::string& s, const std::string& s1, const std::string& s2) { size_t pos = s.find(s1); while(pos != std::string::npos) { s.replace(pos, s1.length(), s2); pos = s.find(s1, pos + s2.length()); } } static bool NodeIsUnprocessed(const std::string& wsTagName) { return "xml" == wsTagName; } static bool IsUnckeckedNodes(const std::string& sValue) { return unchecked_nodes_new.end() != std::find(unchecked_nodes_new.begin(), unchecked_nodes_new.end(), sValue); } static std::wstring htmlToXhtml(std::string& sFileContent, bool bNeedConvert) { if (bNeedConvert) { // Определение кодировки std::string sEncoding = NSStringFinder::FindProperty(sFileContent, "charset", {"="}, {";", "\\n", "\\r", " ", "\"", "'"}).m_sValue; if (sEncoding.empty()) sEncoding = NSStringFinder::FindProperty(sFileContent, "encoding", {"="}, {";", "\\n", "\\r", " "}).m_sValue; if (!sEncoding.empty() && !NSStringFinder::Equals("utf-8", sEncoding)) { NSUnicodeConverter::CUnicodeConverter oConverter; sFileContent = U_TO_UTF8(oConverter.toUnicode(sFileContent, sEncoding.c_str())); } } // Избавляемся от лишних символов до <... boost::regex oRegex("<[a-zA-Z]"); boost::match_results oResult; if (boost::regex_search(sFileContent, oResult, oRegex)) sFileContent.erase(0, oResult.position()); //Избавление от while (NSStringFinder::RemoveEmptyTag(sFileContent, "a")); //Избавление от while (NSStringFinder::RemoveEmptyTag(sFileContent, "title")); //Избавление от while (NSStringFinder::RemoveEmptyTag(sFileContent, "script")); // Gumbo GumboOptions options = kGumboDefaultOptions; GumboOutput* output = gumbo_parse_with_options(&options, sFileContent.data(), sFileContent.length()); // prettyprint NSStringUtils::CStringBuilderA oBuilder; prettyprint(output->document, oBuilder); // Конвертирование из string utf8 в wstring return UTF8_TO_U(oBuilder.GetData()); } static std::string Base64ToString(const std::string& sContent, const std::string& sCharset) { std::string sRes; int nSrcLen = (int)sContent.length(); int nDecodeLen = NSBase64::Base64DecodeGetRequiredLength(nSrcLen); BYTE* pData = new BYTE[nDecodeLen]; if (TRUE == NSBase64::Base64Decode(sContent.c_str(), nSrcLen, pData, &nDecodeLen)) { std::wstring sConvert; if(!sCharset.empty() && NSStringFinder::Equals("utf-8", sCharset)) { NSUnicodeConverter::CUnicodeConverter oConverter; sConvert = oConverter.toUnicode(reinterpret_cast(pData), (unsigned)nDecodeLen, sCharset.data()); } sRes = sConvert.empty() ? std::string(reinterpret_cast(pData), nDecodeLen) : U_TO_UTF8(sConvert); } RELEASEARRAYOBJECTS(pData); return sRes; } static std::string QuotedPrintableDecode(const std::string& sContent, std::string& sCharset) { NSStringUtils::CStringBuilderA sRes; size_t ip = 0; size_t i = sContent.find('='); if(i == 0) { size_t nIgnore = 12; std::string charset = sContent.substr(0, nIgnore); if(charset == "=00=00=FE=FF") sCharset = "UTF-32BE"; else if(charset == "=FF=FE=00=00") sCharset = "UTF-32LE"; else if(charset == "=2B=2F=76=38" || charset == "=2B=2F=76=39" || charset == "=2B=2F=76=2B" || charset == "=2B=2F=76=2F") sCharset = "UTF-7"; else if(charset == "=DD=73=66=73") sCharset = "UTF-EBCDIC"; else if(charset == "=84=31=95=33") sCharset = "GB-18030"; else { nIgnore -= 3; charset.erase(nIgnore); if(charset == "=EF=BB=BF") sCharset = "UTF-8"; else if(charset == "=F7=64=4C") sCharset = "UTF-1"; else if(charset == "=0E=FE=FF") sCharset = "SCSU"; else if(charset == "=FB=EE=28") sCharset = "BOCU-1"; else { nIgnore -= 3; charset.erase(nIgnore); if(charset == "=FE=FF") sCharset = "UTF-16BE"; else if(charset == "=FF=FE") sCharset = "UTF-16LE"; else nIgnore -= 6; } } ip = nIgnore; i = sContent.find('=', ip); } while(i != std::string::npos && i + 2 < sContent.length()) { sRes.WriteString(sContent.c_str() + ip, i - ip); std::string str = sContent.substr(i + 1, 2); if(str.front() == '\n' || str.front() == '\r') { char ch = str[1]; if(ch != '\n' && ch != '\r') sRes.WriteString(&ch, 1); } else { char* err; char ch = (int)strtol(str.data(), &err, 16); if(*err) sRes.WriteString('=' + str); else sRes.WriteString(&ch, 1); } ip = i + 3; i = sContent.find('=', ip); } if(ip != std::string::npos) sRes.WriteString(sContent.c_str() + ip); return sRes.GetData(); } static void ReadMht(const std::string& sMhtContent, std::map& sRes, NSStringUtils::CStringBuilderA& oRes) { size_t unContentPosition = 0, unCharsetBegin = 0, unCharsetEnd = std::string::npos; NSStringFinder::TFoundedData oData; // Content-Type oData = NSStringFinder::FindProperty(sMhtContent, "content-type", {":"}, {";", "\\n", "\\r"}); const std::string sContentType{oData.m_sValue}; if (sContentType.empty()) return; if (NSStringFinder::Equals(sContentType, "multipart/alternative")) { oRes.WriteString(mhtTohtml(sMhtContent.substr(oData.m_unEndPosition, sMhtContent.length() - oData.m_unEndPosition))); return; } unContentPosition = std::max(unContentPosition, oData.m_unEndPosition); unCharsetBegin = oData.m_unEndPosition; // name // std::string sName = NSStringFinder::FindProperty(sMhtContent, "name", {"="}, {";", "\\n", "\\r"}, 0, unLastPosition); // unContentPosition = std::max(unContentPosition, unLastPosition); // Content-Location oData = NSStringFinder::FindProperty(sMhtContent, "content-location", {":"}, {";", "\\n", "\\r"}); std::string sContentLocation{oData.m_sValue}; if (!oData.Empty()) unContentPosition = std::max(unContentPosition, oData.m_unEndPosition); // Content-ID oData = NSStringFinder::FindProperty(sMhtContent, "content-id", {":"}, {";", "\\n", "\\r"}); std::string sContentID{oData.m_sValue}; if (!oData.Empty()) { unContentPosition = std::max(unContentPosition, oData.m_unEndPosition); unCharsetEnd = std::min(unCharsetEnd, oData.m_unBeginPosition); NSStringFinder::CutInside(sContentID, "<", ">"); } if (sContentLocation.empty() && !sContentID.empty()) sContentLocation = "cid:" + sContentID; // Content-Transfer-Encoding oData = NSStringFinder::FindProperty(sMhtContent, "content-transfer-encoding", {":"}, {";", "\\n", "\\r"}); const std::string sContentEncoding{oData.m_sValue}; if (!oData.Empty()) { unContentPosition = std::max(unContentPosition, oData.m_unEndPosition); unCharsetEnd = std::min(unCharsetEnd, oData.m_unBeginPosition); } // charset std::string sCharset = "utf-8"; if (std::string::npos != unCharsetEnd && unCharsetBegin < unCharsetEnd) { sCharset = NSStringFinder::FindProperty(sMhtContent.substr(unCharsetBegin, unCharsetEnd - unCharsetBegin), "charset", {"="}, {";", "\\n", "\\r"}).m_sValue; NSStringFinder::CutInside(sCharset, "\""); } // Content std::string sContent = sMhtContent.substr(unContentPosition, sMhtContent.length() - unContentPosition); // std::wstring sExtention = NSFile::GetFileExtention(UTF8_TO_U(sName)); // std::transform(sExtention.begin(), sExtention.end(), sExtention.begin(), tolower); // Основной документ if (NSStringFinder::Equals(sContentType, "multipart/alternative")) oRes.WriteString(mhtTohtml(sContent)); else if ((NSStringFinder::Find(sContentType, "text") /*&& (sExtention.empty() || NSStringFinder::EqualOf(sExtention, {L"htm", L"html", L"xhtml", L"css"}))*/) || (NSStringFinder::Equals(sContentType, "application/octet-stream") && NSStringFinder::Find(sContentLocation, "css"))) { // Стили заключаются в тэг "); } // Картинки else if ((NSStringFinder::Find(sContentType, "image") /*|| NSStringFinder::Equals(sExtention, L"gif")*/ || NSStringFinder::Equals(sContentType, "application/octet-stream")) && NSStringFinder::Equals(sContentEncoding, "base64")) { // if (NSStringFinder::Equals(sExtention, L"ico") || NSStringFinder::Find(sContentType, "ico")) // sContentType = "image/jpg"; // else if(NSStringFinder::Equals(sExtention, L"gif")) // sContentType = "image/gif"; int nSrcLen = (int)sContent.length(); int nDecodeLen = NSBase64::Base64DecodeGetRequiredLength(nSrcLen); BYTE* pData = new BYTE[nDecodeLen]; if (TRUE == NSBase64::Base64Decode(sContent.c_str(), nSrcLen, pData, &nDecodeLen)) sRes.insert(std::make_pair(sContentLocation, "data:" + sContentType + ";base64," + sContent)); RELEASEARRAYOBJECTS(pData); } } static std::string mhtTohtml(const std::string& sFileContent) { std::map sRes; NSStringUtils::CStringBuilderA oRes; // Поиск boundary NSStringFinder::TFoundedData oData{NSStringFinder::FindProperty(sFileContent, "boundary", {"="}, {"\\r", "\\n", "\""})}; size_t nFound{oData.m_unEndPosition}; std::string sBoundary{oData.m_sValue}; if (sBoundary.empty()) { size_t nFoundEnd = sFileContent.length(); nFound = 0; ReadMht(sFileContent.substr(nFound, nFoundEnd), sRes, oRes); return oRes.GetData(); } NSStringFinder::CutInside(sBoundary, "\""); size_t nFoundEnd{nFound}; sBoundary = "--" + sBoundary; size_t nBoundaryLength = sBoundary.length(); nFound = sFileContent.find(sBoundary, nFound) + nBoundaryLength; // Цикл по boundary while(nFound != std::string::npos) { nFoundEnd = sFileContent.find(sBoundary, nFound + nBoundaryLength); if(nFoundEnd == std::string::npos) break; ReadMht(sFileContent.substr(nFound, nFoundEnd - nFound), sRes, oRes); nFound = sFileContent.find(sBoundary, nFoundEnd); } std::string sFile = oRes.GetData(); for(const std::pair& item : sRes) { std::string sName = item.first; size_t found = sFile.find(sName); size_t sfound = sName.rfind('/'); if(found == std::string::npos && sfound != std::string::npos) found = sFile.find(sName.erase(0, sfound + 1)); while(found != std::string::npos) { size_t fq = sFile.find_last_of("\"\'>=", found); if (std::string::npos == fq) break; char ch = sFile[fq]; if(ch != '\"' && ch != '\'') fq++; size_t tq = sFile.find_first_of("\"\'<> ", found) + 1; if (std::string::npos == tq) break; if(sFile[tq] != '\"' && sFile[tq] != '\'') tq--; if(ch != '>') { std::string is = '\"' + item.second + '\"'; sFile.replace(fq, tq - fq, is); found = sFile.find(sName, fq + is.length()); } else found = sFile.find(sName, tq); } } return sFile; } static std::wstring mhtToXhtml(std::string& sFileContent) { sFileContent = mhtTohtml(sFileContent); // Gumbo GumboOptions options = kGumboDefaultOptions; GumboOutput* output = gumbo_parse_with_options(&options, sFileContent.data(), sFileContent.length()); // prettyprint NSStringUtils::CStringBuilderA oBuilder; prettyprint(output->document, oBuilder); // Конвертирование из string utf8 в wstring return UTF8_TO_U(oBuilder.GetData()); } // Заменяет сущности &,<,> в text static void substitute_xml_entities_into_text(std::string& text) { // replacing & must come first replace_all(text, "&", "&"); replace_all(text, "<", "<"); replace_all(text, ">", ">"); } // After running through Gumbo, the values of type "" are replaced with the corresponding code '0x01' // Since the attribute value does not use control characters (value <= 0x09), // then just delete them, otherwise XmlUtils::CXmlLiteReader crashes on them. // bug#73486 static void remove_control_symbols(std::string& text) { std::string::iterator itFound = std::find_if(text.begin(), text.end(), [](unsigned char chValue){ return chValue <= 0x09; }); while (itFound != text.end()) { itFound = text.erase(itFound); itFound = std::find_if(itFound, text.end(), [](unsigned char chValue){ return chValue <= 0x09; }); } } // Заменяет сущности " в text static void substitute_xml_entities_into_attributes(std::string& text) { remove_control_symbols(text); substitute_xml_entities_into_text(text); replace_all(text, "\"", """); } static std::string handle_unknown_tag(GumboStringPiece* text) { if (text->data == NULL) return ""; GumboStringPiece gsp = *text; gumbo_tag_from_original_text(&gsp); std::string sAtr = std::string(gsp.data, gsp.length); size_t found = sAtr.find_first_of("-'+,./=?;!*#@$_%<>&;\"\'()[]{}"); while(found != std::string::npos) { sAtr.erase(found, 1); found = sAtr.find_first_of("-'+,./=?;!*#@$_%<>&;\"\'()[]{}", found); } return sAtr; } static std::string get_tag_name(GumboNode* node) { std::string tagname = (node->type == GUMBO_NODE_DOCUMENT ? "document" : gumbo_normalized_tagname(node->v.element.tag)); if (tagname.empty()) tagname = handle_unknown_tag(&node->v.element.original_tag); return tagname; } static void build_doctype(GumboNode* node, NSStringUtils::CStringBuilderA& oBuilder) { if (node->v.document.has_doctype) { oBuilder.WriteString("v.document.name); std::string pi(node->v.document.public_identifier); remove_control_symbols(pi); if ((node->v.document.public_identifier != NULL) && !pi.empty()) { oBuilder.WriteString(" PUBLIC \""); oBuilder.WriteString(pi); oBuilder.WriteString("\" \""); oBuilder.WriteString(node->v.document.system_identifier); oBuilder.WriteString("\""); } oBuilder.WriteString(">"); } } static void build_attributes(const GumboVector* attribs, NSStringUtils::CStringBuilderA& atts) { std::vector arrRepeat; for (size_t i = 0; i < attribs->length; ++i) { GumboAttribute* at = static_cast(attribs->data[i]); std::string sVal(at->value); std::string sName(at->name); remove_control_symbols(sVal); remove_control_symbols(sName); atts.WriteString(" "); bool bCheck = false; size_t nBad = sName.find_first_of("+,.=?#%<>&;\"\'()[]{}"); while(nBad != std::string::npos) { sName.erase(nBad, 1); nBad = sName.find_first_of("+,.=?#%<>&;\"\'()[]{}", nBad); if(sName.empty()) break; bCheck = true; } if(sName.empty()) continue; while(sName.front() >= '0' && sName.front() <= '9') { sName.erase(0, 1); if(sName.empty()) break; bCheck = true; } if(bCheck) { GumboAttribute* check = gumbo_get_attribute(attribs, sName.c_str()); if(check || std::find(arrRepeat.begin(), arrRepeat.end(), sName) != arrRepeat.end()) continue; else arrRepeat.push_back(sName); } if(sName.empty()) continue; atts.WriteString(sName); // determine original quote character used if it exists std::string qs ="\""; atts.WriteString("="); atts.WriteString(qs); substitute_xml_entities_into_attributes(sVal); atts.WriteString(sVal); atts.WriteString(qs); } } static void prettyprint_contents(GumboNode* node, NSStringUtils::CStringBuilderA& contents, bool bCheckValidNode) { std::string key = "|" + get_tag_name(node) + "|"; bool keep_whitespace = preserve_whitespace.find(key) != std::string::npos; bool is_inline = nonbreaking_inline.find(key) != std::string::npos; bool is_like_inline = treat_like_inline.find(key) != std::string::npos; GumboVector* children = &node->v.element.children; for (size_t i = 0; i < children->length; i++) { GumboNode* child = static_cast (children->data[i]); if (child->type == GUMBO_NODE_TEXT) { std::string val(child->v.text.text); remove_control_symbols(val); substitute_xml_entities_into_text(val); // Избавление от FF size_t found = val.find_first_of("\014"); while(found != std::string::npos) { val.erase(found, 1); found = val.find_first_of("\014", found); } contents.WriteString(val); } else if ((child->type == GUMBO_NODE_ELEMENT) || (child->type == GUMBO_NODE_TEMPLATE)) prettyprint(child, contents, bCheckValidNode); else if (child->type == GUMBO_NODE_WHITESPACE) { if (keep_whitespace || is_inline || is_like_inline) contents.WriteString(child->v.text.text); } else if (child->type != GUMBO_NODE_COMMENT) { // Сообщение об ошибке // Does this actually exist: (child->type == GUMBO_NODE_CDATA) // fprintf(stderr, "unknown element of type: %d\n", child->type); } } } static void prettyprint(GumboNode* node, NSStringUtils::CStringBuilderA& oBuilder, bool bCheckValidNode) { // special case the document node if (node->type == GUMBO_NODE_DOCUMENT) { build_doctype(node, oBuilder); prettyprint_contents(node, oBuilder, bCheckValidNode); return; } std::string tagname = get_tag_name(node); remove_control_symbols(tagname); if (NodeIsUnprocessed(tagname)) return; if (bCheckValidNode) bCheckValidNode = !IsUnckeckedNodes(tagname); if (bCheckValidNode && html_tags.end() == std::find(html_tags.begin(), html_tags.end(), tagname)) { prettyprint_contents(node, oBuilder, bCheckValidNode); return; } std::string close = ""; std::string closeTag = ""; std::string key = "|" + tagname + "|"; bool is_empty_tag = empty_tags.find(key) != std::string::npos; // determine closing tag type if (is_empty_tag) close = "/"; else closeTag = "" + tagname + ">"; // build results oBuilder.WriteString("<" + tagname); // build attr string const GumboVector* attribs = &node->v.element.attributes; build_attributes(attribs, oBuilder); oBuilder.WriteString(close + ">"); // prettyprint your contents prettyprint_contents(node, oBuilder, bCheckValidNode); oBuilder.WriteString(closeTag); } #endif // HTMLTOXHTML_H