Files
Yajbir Singh f1b860b25c
Some checks failed
check / markdownlint (push) Has been cancelled
check / spellchecker (push) Has been cancelled
updated
2025-12-11 19:03:17 +05:30

438 lines
11 KiB
C++

/*
* (c) Copyright Ascensio System SIA 2010-2023
*
* This program is a free software product. You can redistribute it and/or
* modify it under the terms of the GNU Affero General Public License (AGPL)
* version 3 as published by the Free Software Foundation. In accordance with
* Section 7(a) of the GNU AGPL its Section 15 shall be amended to the effect
* that Ascensio System SIA expressly excludes the warranty of non-infringement
* of any third-party rights.
*
* This program is distributed WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. For
* details, see the GNU AGPL at: http://www.gnu.org/licenses/agpl-3.0.html
*
* You can contact Ascensio System SIA at 20A-6 Ernesta Birznieka-Upish
* street, Riga, Latvia, EU, LV-1050.
*
* The interactive user interfaces in modified source and object code versions
* of the Program must display Appropriate Legal Notices, as required under
* Section 5 of the GNU AGPL version 3.
*
* Pursuant to Section 7(b) of the License you must retain the original Product
* logo when distributing the program. Pursuant to Section 7(e) we decline to
* grant you any rights under trademark law for use of our trademarks.
*
* All the Product's GUI elements, including illustrations and icon sets, as
* well as technical writing content are licensed under the terms of the
* Creative Commons Attribution-ShareAlike 4.0 International. See the License
* terms at http://creativecommons.org/licenses/by-sa/4.0/legalcode
*
*/
#include "./TextHyphen.h"
#include <iostream>
#include <map>
#include <sstream>
#include "../../Common/3dParty/hyphen/hyphen/hnjalloc.h"
#include "../../Common/3dParty/hyphen/hyphen/hnjalloc.c"
#include "../../Common/3dParty/hyphen/hyphen/hyphen.c"
#ifndef HYPHEN_ENGINE_DISABLE_FILESYSTEM
#include "../common/Directory.h"
#endif
// Stream-based variant of the dictionary loader from hyphen.c.
//
// Reads a hyphenation dictionary from `in`: the first whitespace-delimited
// token is the character set name, every following line is either a comment
// (starts with '%'), the NEXTLEVEL separator, or a pattern/option line that is
// handed to hnj_hyphen_load_line(). Two HyphenDict levels are always built;
// when the input has no NEXTLEVEL section, a default first level (hyphen and
// apostrophe handling) is synthesized and returned with the loaded patterns
// attached as its `nextlevel`.
//
// Returns the top-level dictionary. The structures are allocated with
// hnj_malloc(), so the result is not suitable for `delete`.
HyphenDict* hnj_hyphen_load_stream(std::istream &in)
{
    HyphenDict *dict[2];
    HashTab *hashtab;
    char buf[MAX_CHARS];
    int nextlevel = 0;
    int i, j, k;
    HashEntry *e;
    int state_num = 0;

    /* loading one or two dictionaries (separated by NEXTLEVEL keyword) */
    for (k = 0; k < 2; k++)
    {
        hashtab = hnj_hash_new();
#ifdef VERBOSE
        global[k] = hashtab;
#endif
        hnj_hash_insert (hashtab, "", 0);

        dict[k] = (HyphenDict *) hnj_malloc (sizeof(HyphenDict));
        dict[k]->num_states = 1;
        dict[k]->states = (HyphenState *) hnj_malloc (sizeof(HyphenState));
        dict[k]->states[0].match = NULL;
        dict[k]->states[0].repl = NULL;
        dict[k]->states[0].fallback_state = -1;
        dict[k]->states[0].num_trans = 0;
        dict[k]->states[0].trans = NULL;
        dict[k]->nextlevel = NULL;
        dict[k]->lhmin = 0;
        dict[k]->rhmin = 0;
        dict[k]->clhmin = 0;
        dict[k]->crhmin = 0;
        dict[k]->nohyphen = NULL;
        dict[k]->nohyphenl = 0;

        /* read in character set info */
        if (k == 0)
        {
            for (i = 0; i < MAX_NAME; i++)
                dict[k]->cset[i] = 0;

            /* Limit the extraction to the size of cset: before C++20 the
               char-buffer operator>> is bounded only by the stream width. */
            in.width(sizeof(dict[k]->cset));
            if (in >> dict[k]->cset)
            {
                for (i = 0; i < MAX_NAME; i++)
                    if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
                        dict[k]->cset[i] = 0;
            }
            else
            {
                dict[k]->cset[0] = 0;
            }
            /* make sure the width does not leak into later extractions */
            in.width(0);

            dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
        }
        else
        {
            strncpy(dict[k]->cset, dict[0]->cset, sizeof(dict[k]->cset)-1);
            dict[k]->cset[sizeof(dict[k]->cset)-1] = '\0';
            dict[k]->utf8 = dict[0]->utf8;
        }

        if (k == 0 || nextlevel)
        {
            for (;;)
            {
                /* One byte of buf is kept in reserve so that the '\n'
                   appended below always fits. */
                in.getline(buf, sizeof(buf) - 1, '\n');

                if (in.fail())
                {
                    /* End of data or a broken stream: stop reading. */
                    if (in.bad() || in.eof() || in.gcount() == 0)
                        break;

                    /* failbit alone with characters extracted means the line
                       did not fit in the buffer: discard the rest of it and
                       carry on with the next line. */
                    in.clear();
                    int c;
                    while ((c = in.get()) != '\n' && c != EOF);

                    /* issue warning if not a comment */
                    if (buf[0] != '%')
                    {
                        fprintf(stderr, "Warning: skipping too long pattern (more than %lu chars)\n", (unsigned long)sizeof(buf));
                    }
                    continue;
                }

                /* getline() drops the delimiter; hnj_hyphen_load_line() is fed
                   newline-terminated lines. */
                size_t len = strlen(buf);
                buf[len] = '\n';
                buf[len + 1] = '\0';

                if (strncmp(buf, "NEXTLEVEL", 9) == 0)
                {
                    nextlevel = 1;
                    break;
                }
                else if (buf[0] != '%')
                {
                    hnj_hyphen_load_line(buf, dict[k], hashtab);
                }
            }
        }
        else if (k == 1)
        {
            /* default first level: hyphen and ASCII apostrophe */
            if (!dict[0]->utf8)
                hnj_hyphen_load_line((char*)"NOHYPHEN ',-\n", dict[k], hashtab);
            else
                hnj_hyphen_load_line((char*)"NOHYPHEN ',\xe2\x80\x93,\xe2\x80\x99,-\n", dict[k], hashtab);

            strncpy(buf, (char*)"1-1\n", MAX_CHARS - 1); /* buf rewritten by hnj_hyphen_load here */
            buf[MAX_CHARS-1] = '\0';
            hnj_hyphen_load_line(buf, dict[k], hashtab); /* remove hyphen */
            hnj_hyphen_load_line((char*)"1'1\n", dict[k], hashtab); /* ASCII apostrophe */
            if (dict[0]->utf8)
            {
                hnj_hyphen_load_line((char*)"1\xe2\x80\x93" "1\n", dict[k], hashtab); /* endash */
                hnj_hyphen_load_line((char*)"1\xe2\x80\x99" "1\n", dict[k], hashtab); /* apostrophe */
            }
        }

        /* Could do unioning of matches here (instead of the preprocessor script).
           If we did, the pseudocode would look something like this:
           foreach state in the hash table
             foreach i = [1..length(state) - 1]
               state to check is substr (state, i)
               look it up
               if found, and if there is a match, union the match in.
           It's also possible to avoid the quadratic blowup by doing the
           search in order of increasing state string sizes - then you
           can break the loop after finding the first match.
           This step should be optional in any case - if there is a
           preprocessed rule table, it's always faster to use that.
        */

        /* put in the fallback states */
        for (i = 0; i < HASH_SIZE; i++)
            for (e = hashtab->entries[i]; e; e = e->next)
            {
                /* look up successively shorter suffixes of the key until one
                   is a known state (the empty string is always present) */
                if (*(e->key)) for (j = 1; 1; j++)
                {
                    state_num = hnj_hash_lookup (hashtab, e->key + j);
                    if (state_num >= 0)
                        break;
                }
                /* KBH: FIXME state 0 fallback_state should always be -1? */
                if (e->val)
                    dict[k]->states[e->val].fallback_state = state_num;
            }

#ifdef VERBOSE
        for (i = 0; i < HASH_SIZE; i++)
            for (e = hashtab->entries[i]; e; e = e->next)
            {
                printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
                        dict[k]->states[e->val].fallback_state);
                for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
                    printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
                            dict[k]->states[e->val].trans[j].new_state);
            }
#endif

#ifndef VERBOSE
        hnj_hash_free (hashtab);
#endif
        state_num = 0;
    }

    if (nextlevel) dict[0]->nextlevel = dict[1];
    else
    {
        /* no NEXTLEVEL section: the synthesized level becomes the entry point
           and inherits the hyphenmin settings of the loaded patterns */
        dict[1] -> nextlevel = dict[0];
        dict[1]->lhmin = dict[0]->lhmin;
        dict[1]->rhmin = dict[0]->rhmin;
        dict[1]->clhmin = (dict[0]->clhmin) ? dict[0]->clhmin : ((dict[0]->lhmin) ? dict[0]->lhmin : 3);
        dict[1]->crhmin = (dict[0]->crhmin) ? dict[0]->crhmin : ((dict[0]->rhmin) ? dict[0]->rhmin : 3);
#ifdef VERBOSE
        HashTab *r = global[0];
        global[0] = global[1];
        global[1] = r;
#endif
        return dict[1];
    }
    return dict[0];
}
namespace NSHyphen
{
class CEngine_private
{
public:
static std::wstring m_sDirectory;
int m_nCacheSize;
std::map<int, HyphenDict*> m_mapDicts;
int m_nLastLang;
HyphenDict* m_pLastDict;
// работаем всегда в пределах одной памяти
char* m_pHyphenVector;
size_t m_nHyphenVectorSize;
CEngine_private()
{
m_nHyphenVectorSize = 100;
m_pHyphenVector = new char[m_nHyphenVectorSize];
memset(m_pHyphenVector, 0, m_nHyphenVectorSize);
m_nLastLang = 0;
m_pLastDict = NULL;
#ifndef HYPHEN_ENGINE_DISABLE_FILESYSTEM
m_nCacheSize = 5;
#else
m_nCacheSize = -1;
#endif
}
~CEngine_private()
{
delete [] m_pHyphenVector;
}
void AddLast()
{
if (m_pLastDict)
{
if (m_nCacheSize > 0 && m_mapDicts.size() == m_nCacheSize)
{
std::map<int, HyphenDict*>::iterator first = m_mapDicts.begin();
delete first->second;
m_mapDicts.erase(first);
}
m_mapDicts.insert(std::pair<int, HyphenDict*>{m_nLastLang, m_pLastDict});
}
}
int LoadDictionary(const int& lang)
{
m_nLastLang = lang;
m_pLastDict = NULL;
#ifndef HYPHEN_ENGINE_DISABLE_FILESYSTEM
std::wstring sFilePath = GetDictionaryPath(m_nLastLang);
if (m_sDirectory.empty())
m_sDirectory = NSFile::GetProcessDirectory() + L"/dictionaries";
if (NSFile::CFileBinary::Exists(sFilePath))
{
FILE* f = NSFile::CFileBinary::OpenFileNative(sFilePath, L"r");
if (f == NULL)
return 1;
m_pLastDict = hnj_hyphen_load_file(f);
fclose(f);
}
#endif
AddLast();
return (NULL == m_pLastDict) ? 1 : 0;
}
int LoadDictionary(const int& lang, const unsigned char* data, const unsigned int& data_len)
{
m_nLastLang = lang;
m_pLastDict = NULL;
std::stringstream ss;
ss.write((const char*)data, data_len);
m_pLastDict = hnj_hyphen_load_stream(ss);
AddLast();
return (NULL == m_pLastDict) ? 1 : 0;
}
#ifndef HYPHEN_ENGINE_DISABLE_FILESYSTEM
std::wstring GetDictionaryPath(const int& lang)
{
if (m_sDirectory.empty())
m_sDirectory = NSFile::GetProcessDirectory() + L"/dictionaries";
for (int i = 0; i < NSTextLanguages::DictionaryRec_count; ++i)
{
if (m_nLastLang == NSTextLanguages::Dictionaries[i].m_lang)
{
const char* sNameStr = NSTextLanguages::Dictionaries[i].m_name;
std::wstring sNameU = NSFile::CUtf8Converter::GetUnicodeStringFromUTF8((BYTE*)sNameStr, (LONG)(strlen(sNameStr)));
std::wstring sFilePath = m_sDirectory + L"/" + sNameU + L"/hyph_" + sNameU + L".dic";
return sFilePath;
}
}
return L"";
}
#endif
char* Process(const int& lang, const char* word, const int& len)
{
// resize 2x
if (len + 5 > m_nHyphenVectorSize)
{
delete[] m_pHyphenVector;
m_nHyphenVectorSize = 2 * (len + 5);
m_pHyphenVector = new char[m_nHyphenVectorSize];
memset(m_pHyphenVector, 0, m_nHyphenVectorSize);
}
else
{
// обнуляем после последнего использования
char* mem = m_pHyphenVector;
while (*mem)
*mem++ = 0;
}
if (m_nLastLang != lang)
{
std::map<int, HyphenDict*>::iterator find = m_mapDicts.find(lang);
if (find != m_mapDicts.end())
{
m_nLastLang = lang;
m_pLastDict = find->second;
}
else
{
#ifndef HYPHEN_ENGINE_DISABLE_FILESYSTEM
LoadDictionary(lang);
#else
m_nLastLang = lang;
m_pLastDict = NULL;
#endif
}
}
if (m_pLastDict)
{
char **rep = NULL;
int *pos = NULL;
int *cut = NULL;
hnj_hyphen_hyphenate2(m_pLastDict, word, len, m_pHyphenVector, NULL, &rep, &pos, &cut);
return m_pHyphenVector;
}
return NULL;
}
};
std::wstring CEngine_private::m_sDirectory = L"";
// Creates the engine together with its private implementation object.
CEngine::CEngine()
{
    CEngine_private* pImpl = new CEngine_private();
    m_internal = pImpl;
}
// Destroys the private implementation object owned by this engine.
CEngine::~CEngine()
{
    CEngine_private* pImpl = m_internal;
    delete pImpl;
}
// Sets the dictionaries directory. The value is stored in a static member of
// CEngine_private, so it applies to every engine instance.
void CEngine::Init(const std::wstring& directory)
{
    std::wstring& sTarget = CEngine_private::m_sDirectory;
    sTarget = directory;
}
// Changes the dictionary cache limit. The new value is accepted only while no
// dictionary has been cached yet, and only in builds with filesystem support.
void CEngine::SetCacheSize(const int& size)
{
#ifndef HYPHEN_ENGINE_DISABLE_FILESYSTEM
    if (!m_internal->m_mapDicts.empty())
        return;

    m_internal->m_nCacheSize = size;
#endif
}
// Loads the dictionary for `lang` through the private implementation and
// passes its status code back unchanged (0 - loaded, 1 - not loaded).
int CEngine::LoadDictionary(const int& lang)
{
    const int nStatus = m_internal->LoadDictionary(lang);
    return nStatus;
}
// Loads the dictionary for `lang` from an in-memory buffer of `data_len`
// bytes; the status code of the private implementation is returned as is.
int CEngine::LoadDictionary(const int& lang, const unsigned char* data, const unsigned int& data_len)
{
    const int nStatus = m_internal->LoadDictionary(lang, data, data_len);
    return nStatus;
}
// Hyphenates `word` using the dictionary for `lang`.
// `len` is the length of `word` in bytes; -1 means "zero-terminated, measure
// it here". Returns whatever the private implementation returns, or NULL when
// `word` is NULL.
char* CEngine::Process(const int& lang, const char* word, const int& len)
{
    // strlen() below must never see a null pointer
    if (word == NULL)
        return NULL;

    const int nLength = (len == -1) ? static_cast<int>(strlen(word)) : len;
    return m_internal->Process(lang, word, nLength);
}
// Tells whether `lang` is listed in the NSTextLanguages::Dictionaries table.
// Only the table is consulted; no file is opened.
bool CEngine::IsDictionaryExist(const int& lang)
{
    bool bFound = false;
    for (int nIndex = 0; !bFound && nIndex < NSTextLanguages::DictionaryRec_count; ++nIndex)
        bFound = (NSTextLanguages::Dictionaries[nIndex].m_lang == lang);
    return bFound;
}
}