ispell_checker.cpp
/* vim: set sw=8: -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/* tdespell2 - adopted from Enchant
 * Copyright (C) 2003 Dom Lachowicz
 * Copyright (C) 2004 Zack Rusin <zack@kde.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 * In addition, as a special exception, Dom Lachowicz
 * gives permission to link the code of this program with
 * non-LGPL Spelling Provider libraries (eg: a MSFT Office
 * spell checker backend) and distribute linked combinations including
 * the two. You must obey the GNU Lesser General Public License in all
 * respects for all of the code used other than said providers. If you modify
 * this file, you may extend this exception to your version of the
 * file, but you are not obligated to do so. If you do not wish to
 * do so, delete this exception statement from your version.
00030 */ 00031 00032 #include <config.h> 00033 00034 #include <stdio.h> 00035 #include <stdlib.h> 00036 #include <string.h> 00037 00038 #include <string> 00039 #include <vector> 00040 00041 #include "sp_spell.h" 00042 #include "ispell_checker.h" 00043 00044 #include <tqmap.h> 00045 #include <tqdir.h> 00046 #include <tqfileinfo.h> 00047 00048 /***************************************************************************/ 00049 00050 typedef struct str_ispell_map 00051 { 00052 const char * lang; 00053 const char * dict; 00054 const char * enc; 00055 } IspellMap; 00056 00057 static const char *ispell_dirs [] = { 00058 "/usr/" SYSTEM_LIBDIR "/ispell", 00059 "/usr/lib/ispell", 00060 "/usr/local/" SYSTEM_LIBDIR "/ispell", 00061 "/usr/local/lib/ispell", 00062 "/usr/local/share/ispell", 00063 "/usr/share/ispell", 00064 "/usr/pkg/lib", 00065 0 00066 }; 00067 static const IspellMap ispell_map [] = { 00068 {"ca" ,"catala.hash" ,"iso-8859-1" }, 00069 {"ca_ES" ,"catala.hash" ,"iso-8859-1" }, 00070 {"cs" ,"czech.hash" ,"iso-8859-2" }, 00071 {"cs_CZ" ,"czech.hash" ,"iso-8859-2" }, 00072 {"da" ,"dansk.hash" ,"iso-8859-1" }, 00073 {"da_DK" ,"dansk.hash" ,"iso-8859-1" }, 00074 {"de" ,"deutsch.hash" ,"iso-8859-1" }, 00075 {"de_CH" ,"swiss.hash" ,"iso-8859-1" }, 00076 {"de_AT" ,"deutsch.hash" ,"iso-8859-1" }, 00077 {"de_DE" ,"deutsch.hash" ,"iso-8859-1" }, 00078 {"el" ,"ellhnika.hash" ,"iso-8859-7" }, 00079 {"el_GR" ,"ellhnika.hash" ,"iso-8859-7" }, 00080 {"en" ,"british.hash" ,"iso-8859-1" }, 00081 {"en_AU" ,"british.hash" ,"iso-8859-1" }, 00082 {"en_BZ" ,"british.hash" ,"iso-8859-1" }, 00083 {"en_CA" ,"british.hash" ,"iso-8859-1" }, 00084 {"en_GB" ,"british.hash" ,"iso-8859-1" }, 00085 {"en_IE" ,"british.hash" ,"iso-8859-1" }, 00086 {"en_JM" ,"british.hash" ,"iso-8859-1" }, 00087 {"en_NZ" ,"british.hash" ,"iso-8859-1" }, 00088 {"en_TT" ,"british.hash" ,"iso-8859-1" }, 00089 {"en_ZA" ,"british.hash" ,"iso-8859-1" }, 00090 {"en_ZW" ,"british.hash" ,"iso-8859-1" }, 00091 {"en_PH" 
,"american.hash" ,"iso-8859-1" }, 00092 {"en_US" ,"american.hash" ,"iso-8859-1" }, 00093 {"eo" ,"esperanto.hash" ,"iso-8859-3" }, 00094 {"es" ,"espanol.hash" ,"iso-8859-1" }, 00095 {"es_AR" ,"espanol.hash" ,"iso-8859-1" }, 00096 {"es_BO" ,"espanol.hash" ,"iso-8859-1" }, 00097 {"es_CL" ,"espanol.hash" ,"iso-8859-1" }, 00098 {"es_CO" ,"espanol.hash" ,"iso-8859-1" }, 00099 {"es_CR" ,"espanol.hash" ,"iso-8859-1" }, 00100 {"es_DO" ,"espanol.hash" ,"iso-8859-1" }, 00101 {"es_EC" ,"espanol.hash" ,"iso-8859-1" }, 00102 {"es_ES" ,"espanol.hash" ,"iso-8859-1" }, 00103 {"es_GT" ,"espanol.hash" ,"iso-8859-1" }, 00104 {"es_HN" ,"espanol.hash" ,"iso-8859-1" }, 00105 {"es_MX" ,"espanol.hash" ,"iso-8859-1" }, 00106 {"es_NI" ,"espanol.hash" ,"iso-8859-1" }, 00107 {"es_PA" ,"espanol.hash" ,"iso-8859-1" }, 00108 {"es_PE" ,"espanol.hash" ,"iso-8859-1" }, 00109 {"es_PR" ,"espanol.hash" ,"iso-8859-1" }, 00110 {"es_PY" ,"espanol.hash" ,"iso-8859-1" }, 00111 {"es_SV" ,"espanol.hash" ,"iso-8859-1" }, 00112 {"es_UY" ,"espanol.hash" ,"iso-8859-1" }, 00113 {"es_VE" ,"espanol.hash" ,"iso-8859-1" }, 00114 {"fi" ,"finnish.hash" ,"iso-8859-1" }, 00115 {"fi_FI" ,"finnish.hash" ,"iso-8859-1" }, 00116 {"fr" ,"francais.hash" ,"iso-8859-1" }, 00117 {"fr_BE" ,"francais.hash" ,"iso-8859-1" }, 00118 {"fr_CA" ,"francais.hash" ,"iso-8859-1" }, 00119 {"fr_CH" ,"francais.hash" ,"iso-8859-1" }, 00120 {"fr_FR" ,"francais.hash" ,"iso-8859-1" }, 00121 {"fr_LU" ,"francais.hash" ,"iso-8859-1" }, 00122 {"fr_MC" ,"francais.hash" ,"iso-8859-1" }, 00123 {"hu" ,"hungarian.hash" ,"iso-8859-2" }, 00124 {"hu_HU" ,"hungarian.hash" ,"iso-8859-2" }, 00125 {"ga" ,"irish.hash" ,"iso-8859-1" }, 00126 {"ga_IE" ,"irish.hash" ,"iso-8859-1" }, 00127 {"gl" ,"galician.hash" ,"iso-8859-1" }, 00128 {"gl_ES" ,"galician.hash" ,"iso-8859-1" }, 00129 {"ia" ,"interlingua.hash" ,"iso-8859-1" }, 00130 {"it" ,"italian.hash" ,"iso-8859-1" }, 00131 {"it_IT" ,"italian.hash" ,"iso-8859-1" }, 00132 {"it_CH" ,"italian.hash" ,"iso-8859-1" }, 00133 
{"la" ,"mlatin.hash" ,"iso-8859-1" }, 00134 {"la_IT" ,"mlatin.hash" ,"iso-8859-1" }, 00135 {"lt" ,"lietuviu.hash" ,"iso-8859-13" }, 00136 {"lt_LT" ,"lietuviu.hash" ,"iso-8859-13" }, 00137 {"nl" ,"nederlands.hash" ,"iso-8859-1" }, 00138 {"nl_NL" ,"nederlands.hash" ,"iso-8859-1" }, 00139 {"nl_BE" ,"nederlands.hash" ,"iso-8859-1" }, 00140 {"nb" ,"norsk.hash" ,"iso-8859-1" }, 00141 {"nb_NO" ,"norsk.hash" ,"iso-8859-1" }, 00142 {"nn" ,"nynorsk.hash" ,"iso-8859-1" }, 00143 {"nn_NO" ,"nynorsk.hash" ,"iso-8859-1" }, 00144 {"no" ,"norsk.hash" ,"iso-8859-1" }, 00145 {"no_NO" ,"norsk.hash" ,"iso-8859-1" }, 00146 {"pl" ,"polish.hash" ,"iso-8859-2" }, 00147 {"pl_PL" ,"polish.hash" ,"iso-8859-2" }, 00148 {"pt" ,"brazilian.hash" ,"iso-8859-1" }, 00149 {"pt_BR" ,"brazilian.hash" ,"iso-8859-1" }, 00150 {"pt_PT" ,"portugues.hash" ,"iso-8859-1" }, 00151 {"ru" ,"russian.hash" ,"koi8-r" }, 00152 {"ru_MD" ,"russian.hash" ,"koi8-r" }, 00153 {"ru_RU" ,"russian.hash" ,"koi8-r" }, 00154 {"sc" ,"sardinian.hash" ,"iso-8859-1" }, 00155 {"sc_IT" ,"sardinian.hash" ,"iso-8859-1" }, 00156 {"sk" ,"slovak.hash" ,"iso-8859-2" }, 00157 {"sk_SK" ,"slovak.hash" ,"iso-8859-2" }, 00158 {"sl" ,"slovensko.hash" ,"iso-8859-2" }, 00159 {"sl_SI" ,"slovensko.hash" ,"iso-8859-2" }, 00160 {"sv" ,"svenska.hash" ,"iso-8859-1" }, 00161 {"sv_SE" ,"svenska.hash" ,"iso-8859-1" }, 00162 {"uk" ,"ukrainian.hash" ,"koi8-u" }, 00163 {"uk_UA" ,"ukrainian.hash" ,"koi8-u" }, 00164 {"yi" ,"yiddish-yivo.hash" ,"utf-8" } 00165 }; 00166 00167 static const size_t size_ispell_map = ( sizeof(ispell_map) / sizeof((ispell_map)[0]) ); 00168 static TQMap<TQString, TQString> ispell_dict_map; 00169 00170 00171 void 00172 ISpellChecker::try_autodetect_charset(const char * const inEncoding) 00173 { 00174 if (inEncoding && strlen(inEncoding)) 00175 { 00176 m_translate_in = TQTextCodec::codecForName(inEncoding); 00177 } 00178 } 00179 00180 /***************************************************************************/ 00181 
/***************************************************************************/ 00182 00183 ISpellChecker::ISpellChecker() 00184 : deftflag(-1), 00185 prefstringchar(-1), 00186 m_bSuccessfulInit(false), 00187 m_BC(NULL), 00188 m_cd(NULL), 00189 m_cl(NULL), 00190 m_cm(NULL), 00191 m_ho(NULL), 00192 m_nd(NULL), 00193 m_so(NULL), 00194 m_se(NULL), 00195 m_ti(NULL), 00196 m_te(NULL), 00197 m_hashstrings(NULL), 00198 m_hashtbl(NULL), 00199 m_pflaglist(NULL), 00200 m_sflaglist(NULL), 00201 m_chartypes(NULL), 00202 m_infile(NULL), 00203 m_outfile(NULL), 00204 m_askfilename(NULL), 00205 m_Trynum(0), 00206 m_translate_in(0) 00207 { 00208 memset(m_sflagindex,0,sizeof(m_sflagindex)); 00209 memset(m_pflagindex,0,sizeof(m_pflagindex)); 00210 } 00211 00212 #ifndef FREEP 00213 #define FREEP(p) do { if (p) free(p); } while (0) 00214 #endif 00215 00216 ISpellChecker::~ISpellChecker() 00217 { 00218 if (m_bSuccessfulInit) { 00219 // only cleanup our mess if we were successfully initialized 00220 00221 clearindex (m_pflagindex); 00222 clearindex (m_sflagindex); 00223 } 00224 00225 FREEP(m_hashtbl); 00226 FREEP(m_hashstrings); 00227 FREEP(m_sflaglist); 00228 FREEP(m_chartypes); 00229 00230 delete m_translate_in; 00231 m_translate_in = 0; 00232 } 00233 00234 bool 00235 ISpellChecker::checkWord( const TQString& utf8Word ) 00236 { 00237 ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN]; 00238 if (!m_bSuccessfulInit) 00239 return false; 00240 00241 if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty()) 00242 return false; 00243 00244 bool retVal = false; 00245 TQCString out; 00246 if (!m_translate_in) 00247 return false; 00248 else { 00249 /* convert to 8bit string and null terminate */ 00250 int len_out = utf8Word.length(); 00251 00252 out = m_translate_in->fromUnicode( utf8Word, len_out ); 00253 } 00254 00255 if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0)) 00256 { 00257 if (good(iWord, 0, 0, 1, 0) == 1 || 00258 compoundgood(iWord, 1) == 1) 
00259 { 00260 retVal = true; 00261 } 00262 } 00263 00264 return retVal; 00265 } 00266 00267 TQStringList 00268 ISpellChecker::suggestWord(const TQString& utf8Word) 00269 { 00270 ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN]; 00271 int c; 00272 00273 if (!m_bSuccessfulInit) 00274 return TQStringList(); 00275 00276 if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || 00277 utf8Word.length() == 0) 00278 return TQStringList(); 00279 00280 TQCString out; 00281 if (!m_translate_in) 00282 return TQStringList(); 00283 else 00284 { 00285 /* convert to 8bit string and null terminate */ 00286 00287 int len_out = utf8Word.length(); 00288 out = m_translate_in->fromUnicode( utf8Word, len_out ); 00289 } 00290 00291 if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0)) 00292 makepossibilities(iWord); 00293 else 00294 return TQStringList(); 00295 00296 TQStringList sugg_arr; 00297 for (c = 0; c < m_pcount; c++) 00298 { 00299 TQString utf8Word; 00300 00301 if (!m_translate_in) 00302 { 00303 /* copy to 8bit string and null terminate */ 00304 utf8Word = TQString::fromUtf8( m_possibilities[c] ); 00305 } 00306 else 00307 { 00308 /* convert to 32bit string and null terminate */ 00309 utf8Word = m_translate_in->toUnicode( m_possibilities[c] ); 00310 } 00311 00312 sugg_arr.append( utf8Word ); 00313 } 00314 00315 return sugg_arr; 00316 } 00317 00318 static void 00319 s_buildHashNames (std::vector<std::string> & names, const char * dict) 00320 { 00321 const char * tmp = 0; 00322 int i = 0; 00323 00324 names.clear (); 00325 00326 while ( (tmp = ispell_dirs[i++]) ) { 00327 TQCString maybeFile = TQCString( tmp ) + '/'; 00328 maybeFile += dict; 00329 names.push_back( maybeFile.data() ); 00330 } 00331 } 00332 00333 static void 00334 s_allDics() 00335 { 00336 const char * tmp = 0; 00337 int i = 0; 00338 00339 while ( (tmp = ispell_dirs[i++]) ) { 00340 TQDir dir( tmp ); 00341 TQStringList lst = dir.entryList( "*.hash" ); 00342 for ( TQStringList::Iterator it = 
lst.begin(); it != lst.end(); ++it ) { 00343 TQFileInfo info( *it ); 00344 for (size_t i = 0; i < size_ispell_map; i++) 00345 { 00346 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i])); 00347 if (!strcmp (info.fileName().latin1(), mapping->dict)) 00348 { 00349 ispell_dict_map.insert( mapping->lang, *it ); 00350 } 00351 } 00352 } 00353 } 00354 } 00355 00356 TQValueList<TQString> 00357 ISpellChecker::allDics() 00358 { 00359 if ( ispell_dict_map.empty() ) 00360 s_allDics(); 00361 00362 return ispell_dict_map.keys(); 00363 } 00364 00365 TQString 00366 ISpellChecker::loadDictionary (const char * szdict) 00367 { 00368 std::vector<std::string> dict_names; 00369 00370 s_buildHashNames (dict_names, szdict); 00371 00372 for (size_t i = 0; i < dict_names.size(); i++) 00373 { 00374 if (linit(const_cast<char*>(dict_names[i].c_str())) >= 0) 00375 return dict_names[i].c_str(); 00376 } 00377 00378 return TQString::null; 00379 } 00380 00387 bool 00388 ISpellChecker::loadDictionaryForLanguage ( const char * szLang ) 00389 { 00390 TQString hashname; 00391 00392 const char * encoding = NULL; 00393 const char * szFile = NULL; 00394 00395 for (size_t i = 0; i < size_ispell_map; i++) 00396 { 00397 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i])); 00398 if (!strcmp (szLang, mapping->lang)) 00399 { 00400 szFile = mapping->dict; 00401 encoding = mapping->enc; 00402 break; 00403 } 00404 } 00405 00406 if (!szFile || !strlen(szFile)) 00407 return false; 00408 00409 alloc_ispell_struct(); 00410 00411 hashname = loadDictionary(szFile); 00412 if (hashname.isEmpty()) 00413 return false; 00414 00415 // one of the two above calls succeeded 00416 setDictionaryEncoding (hashname, encoding); 00417 00418 return true; 00419 } 00420 00421 void 00422 ISpellChecker::setDictionaryEncoding( const TQString& hashname, const char * encoding ) 00423 { 00424 /* Get Hash encoding from XML file. This should always work! 
*/ 00425 try_autodetect_charset(encoding); 00426 00427 if (m_translate_in) 00428 { 00429 /* We still have to setup prefstringchar*/ 00430 prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag 00431 : static_cast<int *>(NULL)); 00432 00433 if (prefstringchar < 0) 00434 { 00435 std::string teststring; 00436 for(int n1 = 1; n1 <= 15; n1++) 00437 { 00438 teststring = "latin" + n1; 00439 prefstringchar = findfiletype(teststring.c_str(), 1, 00440 deftflag < 0 ? &deftflag : static_cast<int *>(NULL)); 00441 if (prefstringchar >= 0) 00442 break; 00443 } 00444 } 00445 00446 return; /* success */ 00447 } 00448 00449 /* Test for UTF-8 first */ 00450 prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL)); 00451 if (prefstringchar >= 0) 00452 { 00453 m_translate_in = TQTextCodec::codecForName("utf8"); 00454 } 00455 00456 if (m_translate_in) 00457 return; /* success */ 00458 00459 /* Test for "latinN" */ 00460 if (!m_translate_in) 00461 { 00462 /* Look for "altstringtype" names from latin1 to latin15 */ 00463 for(int n1 = 1; n1 <= 15; n1++) 00464 { 00465 TQString teststring = TQString("latin%1").arg(n1); 00466 prefstringchar = findfiletype(teststring.latin1(), 1, 00467 deftflag < 0 ? 
&deftflag : static_cast<int *>(NULL)); 00468 if (prefstringchar >= 0) 00469 { 00470 //FIXME: latin1 might be wrong 00471 m_translate_in = TQTextCodec::codecForName( teststring.latin1() ); 00472 break; 00473 } 00474 } 00475 } 00476 00477 /* If nothing found, use latin1 */ 00478 if (!m_translate_in) 00479 { 00480 m_translate_in = TQTextCodec::codecForName("latin1"); 00481 } 00482 } 00483 00484 bool 00485 ISpellChecker::requestDictionary(const char *szLang) 00486 { 00487 if (!loadDictionaryForLanguage (szLang)) 00488 { 00489 // handle a shortened version of the language tag: en_US => en 00490 std::string shortened_dict (szLang); 00491 size_t uscore_pos; 00492 00493 if ((uscore_pos = shortened_dict.rfind ('_')) != ((size_t)-1)) { 00494 shortened_dict = shortened_dict.substr(0, uscore_pos); 00495 if (!loadDictionaryForLanguage (shortened_dict.c_str())) 00496 return false; 00497 } else 00498 return false; 00499 } 00500 00501 m_bSuccessfulInit = true; 00502 00503 if (prefstringchar < 0) 00504 m_defdupchar = 0; 00505 else 00506 m_defdupchar = prefstringchar; 00507 00508 return true; 00509 }