ispell_checker.cpp
00001 /* vim: set sw=8: -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ 00002 /* kspell2 - adopted from Enchant 00003 * Copyright (C) 2003 Dom Lachowicz 00004 * Copyright (C) 2004 Zack Rusin <zack@kde.org> 00005 * 00006 * This library is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2.1 of the License, or (at your option) any later version. 00010 * 00011 * This library is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with this library; if not, write to the 00018 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 00019 * Boston, MA 02110-1301, USA. 00020 * 00021 * In addition, as a special exception, Dom Lachowicz 00022 * gives permission to link the code of this program with 00023 * non-LGPL Spelling Provider libraries (eg: a MSFT Office 00024 * spell checker backend) and distribute linked combinations including 00025 * the two. You must obey the GNU Lesser General Public License in all 00026 * respects for all of the code used other than said providers. If you modify 00027 * this file, you may extend this exception to your version of the 00028 * file, but you are not obligated to do so. If you do not wish to 00029 * do so, delete this exception statement from your version. 00030 */ 00031 00032 #include <config.h> 00033 00034 #include <stdio.h> 00035 #include <stdlib.h> 00036 #include <string.h> 00037 00038 #include <string> 00039 #include <vector> 00040 00041 #include "sp_spell.h" 00042 #include "ispell_checker.h" 00043 00044 #include <tqmap.h> 00045 #include <tqdir.h> 00046 #include <tqfileinfo.h> 00047 00048 /***************************************************************************/ 00049 00050 typedef struct str_ispell_map 00051 { 00052 const char * lang; 00053 const char * dict; 00054 const char * enc; 00055 } IspellMap; 00056 00057 static const char *ispell_dirs [] = { 00058 #ifdef ISPELL_LIBDIR 00059 ISPELL_LIBDIR, 00060 #else 00061 "/usr/" SYSTEM_LIBDIR "/ispell", 00062 "/usr/lib/ispell", 00063 "/usr/local/" SYSTEM_LIBDIR "/ispell", 00064 "/usr/local/lib/ispell", 00065 "/usr/local/share/ispell", 00066 "/usr/share/ispell", 00067 "/usr/pkg/lib", 00068 #endif 00069 0 00070 }; 00071 static const IspellMap ispell_map [] = { 00072 {"ca" ,"catala.hash" ,"iso-8859-1" }, 00073 {"ca_ES" ,"catala.hash" ,"iso-8859-1" }, 00074 {"cs" ,"czech.hash" ,"iso-8859-2" }, 00075 {"cs_CZ" ,"czech.hash" ,"iso-8859-2" }, 00076 {"da" ,"dansk.hash" ,"iso-8859-1" }, 00077 {"da_DK" ,"dansk.hash" ,"iso-8859-1" }, 00078 {"de" ,"deutsch.hash" ,"iso-8859-1" }, 00079 {"de_CH" ,"swiss.hash" ,"iso-8859-1" }, 00080 {"de_AT" ,"deutsch.hash" ,"iso-8859-1" }, 00081 {"de_DE" ,"deutsch.hash" ,"iso-8859-1" }, 00082 {"el" ,"ellhnika.hash" ,"iso-8859-7" }, 00083 {"el_GR" ,"ellhnika.hash" ,"iso-8859-7" }, 00084 {"en" ,"british.hash" ,"iso-8859-1" }, 00085 {"en_AU" ,"british.hash" ,"iso-8859-1" }, 00086 {"en_BZ" ,"british.hash" ,"iso-8859-1" }, 00087 {"en_CA" ,"british.hash" ,"iso-8859-1" }, 00088 {"en_GB" ,"british.hash" ,"iso-8859-1" }, 00089 {"en_IE" ,"british.hash" ,"iso-8859-1" }, 00090 {"en_JM" ,"british.hash" ,"iso-8859-1" }, 00091 {"en_NZ" ,"british.hash" ,"iso-8859-1" }, 00092 {"en_TT" ,"british.hash" ,"iso-8859-1" }, 00093 {"en_ZA" ,"british.hash" ,"iso-8859-1" }, 00094 {"en_ZW" ,"british.hash" ,"iso-8859-1" }, 00095 {"en_PH" ,"american.hash" ,"iso-8859-1" }, 00096 {"en_US" ,"american.hash" ,"iso-8859-1" }, 00097 {"eo" ,"esperanto.hash" ,"iso-8859-3" }, 00098 {"es" ,"espanol.hash" ,"iso-8859-1" }, 00099 {"es_AR" ,"espanol.hash" ,"iso-8859-1" }, 00100 {"es_BO" ,"espanol.hash" ,"iso-8859-1" }, 00101 {"es_CL" ,"espanol.hash" ,"iso-8859-1" }, 00102 {"es_CO" ,"espanol.hash" ,"iso-8859-1" }, 00103 {"es_CR" ,"espanol.hash" ,"iso-8859-1" }, 00104 {"es_DO" ,"espanol.hash" ,"iso-8859-1" }, 00105 {"es_EC" ,"espanol.hash" ,"iso-8859-1" }, 00106 {"es_ES" ,"espanol.hash" ,"iso-8859-1" }, 00107 {"es_GT" ,"espanol.hash" ,"iso-8859-1" }, 00108 {"es_HN" ,"espanol.hash" ,"iso-8859-1" }, 00109 {"es_MX" ,"espanol.hash" ,"iso-8859-1" }, 00110 {"es_NI" ,"espanol.hash" ,"iso-8859-1" }, 00111 {"es_PA" ,"espanol.hash" ,"iso-8859-1" }, 00112 {"es_PE" ,"espanol.hash" ,"iso-8859-1" }, 00113 {"es_PR" ,"espanol.hash" ,"iso-8859-1" }, 00114 {"es_PY" ,"espanol.hash" ,"iso-8859-1" }, 00115 {"es_SV" ,"espanol.hash" ,"iso-8859-1" }, 00116 {"es_UY" ,"espanol.hash" ,"iso-8859-1" }, 00117 {"es_VE" ,"espanol.hash" ,"iso-8859-1" }, 00118 {"fi" ,"finnish.hash" ,"iso-8859-1" }, 00119 {"fi_FI" ,"finnish.hash" ,"iso-8859-1" }, 00120 {"fr" ,"francais.hash" ,"iso-8859-1" }, 00121 {"fr_BE" ,"francais.hash" ,"iso-8859-1" }, 00122 {"fr_CA" ,"francais.hash" ,"iso-8859-1" }, 00123 {"fr_CH" ,"francais.hash" ,"iso-8859-1" }, 00124 {"fr_FR" ,"francais.hash" ,"iso-8859-1" }, 00125 {"fr_LU" ,"francais.hash" ,"iso-8859-1" }, 00126 {"fr_MC" ,"francais.hash" ,"iso-8859-1" }, 00127 {"hu" ,"hungarian.hash" ,"iso-8859-2" }, 00128 {"hu_HU" ,"hungarian.hash" ,"iso-8859-2" }, 00129 {"ga" ,"irish.hash" ,"iso-8859-1" }, 00130 {"ga_IE" ,"irish.hash" ,"iso-8859-1" }, 00131 {"gl" ,"galician.hash" ,"iso-8859-1" }, 00132 {"gl_ES" ,"galician.hash" ,"iso-8859-1" }, 00133 {"ia" ,"interlingua.hash" ,"iso-8859-1" }, 00134 {"it" ,"italian.hash" ,"iso-8859-1" }, 00135 {"it_IT" ,"italian.hash" ,"iso-8859-1" }, 00136 {"it_CH" ,"italian.hash" ,"iso-8859-1" }, 00137 {"la" ,"mlatin.hash" ,"iso-8859-1" }, 00138 {"la_IT" ,"mlatin.hash" ,"iso-8859-1" }, 00139 {"lt" ,"lietuviu.hash" ,"iso-8859-13" }, 00140 {"lt_LT" ,"lietuviu.hash" ,"iso-8859-13" }, 00141 {"nl" ,"nederlands.hash" ,"iso-8859-1" }, 00142 {"nl_NL" ,"nederlands.hash" ,"iso-8859-1" }, 00143 {"nl_BE" ,"nederlands.hash" ,"iso-8859-1" }, 00144 {"nb" ,"norsk.hash" ,"iso-8859-1" }, 00145 {"nb_NO" ,"norsk.hash" ,"iso-8859-1" }, 00146 {"nn" ,"nynorsk.hash" ,"iso-8859-1" }, 00147 {"nn_NO" ,"nynorsk.hash" ,"iso-8859-1" }, 00148 {"no" ,"norsk.hash" ,"iso-8859-1" }, 00149 {"no_NO" ,"norsk.hash" ,"iso-8859-1" }, 00150 {"pl" ,"polish.hash" ,"iso-8859-2" }, 00151 {"pl_PL" ,"polish.hash" ,"iso-8859-2" }, 00152 {"pt" ,"brazilian.hash" ,"iso-8859-1" }, 00153 {"pt_BR" ,"brazilian.hash" ,"iso-8859-1" }, 00154 {"pt_PT" ,"portugues.hash" ,"iso-8859-1" }, 00155 {"ru" ,"russian.hash" ,"koi8-r" }, 00156 {"ru_MD" ,"russian.hash" ,"koi8-r" }, 00157 {"ru_RU" ,"russian.hash" ,"koi8-r" }, 00158 {"sc" ,"sardinian.hash" ,"iso-8859-1" }, 00159 {"sc_IT" ,"sardinian.hash" ,"iso-8859-1" }, 00160 {"sk" ,"slovak.hash" ,"iso-8859-2" }, 00161 {"sk_SK" ,"slovak.hash" ,"iso-8859-2" }, 00162 {"sl" ,"slovensko.hash" ,"iso-8859-2" }, 00163 {"sl_SI" ,"slovensko.hash" ,"iso-8859-2" }, 00164 {"sv" ,"svenska.hash" ,"iso-8859-1" }, 00165 {"sv_SE" ,"svenska.hash" ,"iso-8859-1" }, 00166 {"uk" ,"ukrainian.hash" ,"koi8-u" }, 00167 {"uk_UA" ,"ukrainian.hash" ,"koi8-u" }, 00168 {"yi" ,"yiddish-yivo.hash" ,"utf-8" } 00169 }; 00170 00171 static const size_t size_ispell_map = ( sizeof(ispell_map) / sizeof((ispell_map)[0]) ); 00172 static TQMap<TQString, TQString> ispell_dict_map; 00173 00174 00175 void 00176 ISpellChecker::try_autodetect_charset(const char * const inEncoding) 00177 { 00178 if (inEncoding && strlen(inEncoding)) 00179 { 00180 m_translate_in = TQTextCodec::codecForName(inEncoding); 00181 } 00182 } 00183 00184 /***************************************************************************/ 00185 /***************************************************************************/ 00186 00187 ISpellChecker::ISpellChecker() 00188 : deftflag(-1), 00189 prefstringchar(-1), 00190 m_bSuccessfulInit(false), 00191 m_BC(NULL), 00192 m_cd(NULL), 00193 m_cl(NULL), 00194 m_cm(NULL), 00195 m_ho(NULL), 00196 m_nd(NULL), 00197 m_so(NULL), 00198 m_se(NULL), 00199 m_ti(NULL), 00200 m_te(NULL), 00201 m_hashstrings(NULL), 00202 m_hashtbl(NULL), 00203 m_pflaglist(NULL), 00204 m_sflaglist(NULL), 00205 m_chartypes(NULL), 00206 m_infile(NULL), 00207 m_outfile(NULL), 00208 m_askfilename(NULL), 00209 m_Trynum(0), 00210 m_translate_in(0) 00211 { 00212 memset(m_sflagindex,0,sizeof(m_sflagindex)); 00213 memset(m_pflagindex,0,sizeof(m_pflagindex)); 00214 } 00215 00216 #ifndef FREEP 00217 #define FREEP(p) do { if (p) free(p); } while (0) 00218 #endif 00219 00220 ISpellChecker::~ISpellChecker() 00221 { 00222 if (m_bSuccessfulInit) { 00223 // only cleanup our mess if we were successfully initialized 00224 00225 clearindex (m_pflagindex); 00226 clearindex (m_sflagindex); 00227 } 00228 00229 FREEP(m_hashtbl); 00230 FREEP(m_hashstrings); 00231 FREEP(m_sflaglist); 00232 FREEP(m_chartypes); 00233 00234 delete m_translate_in; 00235 m_translate_in = 0; 00236 } 00237 00238 bool 00239 ISpellChecker::checkWord( const TQString& utf8Word ) 00240 { 00241 ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN]; 00242 if (!m_bSuccessfulInit) 00243 return false; 00244 00245 if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty()) 00246 return false; 00247 00248 bool retVal = false; 00249 TQCString out; 00250 if (!m_translate_in) 00251 return false; 00252 else { 00253 /* convert to 8bit string and null terminate */ 00254 int len_out = utf8Word.length(); 00255 00256 out = m_translate_in->fromUnicode( utf8Word, len_out ); 00257 } 00258 00259 if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0)) 00260 { 00261 if (good(iWord, 0, 0, 1, 0) == 1 || 00262 compoundgood(iWord, 1) == 1) 00263 { 00264 retVal = true; 00265 } 00266 } 00267 00268 return retVal; 00269 } 00270 00271 TQStringList 00272 ISpellChecker::suggestWord(const TQString& utf8Word) 00273 { 00274 ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN]; 00275 int c; 00276 00277 if (!m_bSuccessfulInit) 00278 return TQStringList(); 00279 00280 if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || 00281 utf8Word.length() == 0) 00282 return TQStringList(); 00283 00284 TQCString out; 00285 if (!m_translate_in) 00286 return TQStringList(); 00287 else 00288 { 00289 /* convert to 8bit string and null terminate */ 00290 00291 int len_out = utf8Word.length(); 00292 out = m_translate_in->fromUnicode( utf8Word, len_out ); 00293 } 00294 00295 if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0)) 00296 makepossibilities(iWord); 00297 else 00298 return TQStringList(); 00299 00300 TQStringList sugg_arr; 00301 for (c = 0; c < m_pcount; c++) 00302 { 00303 TQString utf8Word; 00304 00305 if (!m_translate_in) 00306 { 00307 /* copy to 8bit string and null terminate */ 00308 utf8Word = TQString::fromUtf8( m_possibilities[c] ); 00309 } 00310 else 00311 { 00312 /* convert to 32bit string and null terminate */ 00313 utf8Word = m_translate_in->toUnicode( m_possibilities[c] ); 00314 } 00315 00316 sugg_arr.append( utf8Word ); 00317 } 00318 00319 return sugg_arr; 00320 } 00321 00322 static void 00323 s_buildHashNames (std::vector<std::string> & names, const char * dict) 00324 { 00325 const char * tmp = 0; 00326 int i = 0; 00327 00328 names.clear (); 00329 00330 while ( (tmp = ispell_dirs[i++]) ) { 00331 TQCString maybeFile = TQCString( tmp ) + '/'; 00332 maybeFile += dict; 00333 names.push_back( maybeFile.data() ); 00334 } 00335 } 00336 00337 static void 00338 s_allDics() 00339 { 00340 const char * tmp = 0; 00341 int i = 0; 00342 00343 while ( (tmp = ispell_dirs[i++]) ) { 00344 TQDir dir( tmp ); 00345 TQStringList lst = dir.entryList( "*.hash" ); 00346 for ( TQStringList::Iterator it = lst.begin(); it != lst.end(); ++it ) { 00347 TQFileInfo info( *it ); 00348 for (size_t i = 0; i < size_ispell_map; i++) 00349 { 00350 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i])); 00351 if (!strcmp (info.fileName().latin1(), mapping->dict)) 00352 { 00353 ispell_dict_map.insert( mapping->lang, *it ); 00354 } 00355 } 00356 } 00357 } 00358 } 00359 00360 TQValueList<TQString> 00361 ISpellChecker::allDics() 00362 { 00363 if ( ispell_dict_map.empty() ) 00364 s_allDics(); 00365 00366 return ispell_dict_map.keys(); 00367 } 00368 00369 TQString 00370 ISpellChecker::loadDictionary (const char * szdict) 00371 { 00372 std::vector<std::string> dict_names; 00373 00374 s_buildHashNames (dict_names, szdict); 00375 00376 for (size_t i = 0; i < dict_names.size(); i++) 00377 { 00378 if (linit(const_cast<char*>(dict_names[i].c_str())) >= 0) 00379 return dict_names[i].c_str(); 00380 } 00381 00382 return TQString::null; 00383 } 00384 00391 bool 00392 ISpellChecker::loadDictionaryForLanguage ( const char * szLang ) 00393 { 00394 TQString hashname; 00395 00396 const char * encoding = NULL; 00397 const char * szFile = NULL; 00398 00399 for (size_t i = 0; i < size_ispell_map; i++) 00400 { 00401 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i])); 00402 if (!strcmp (szLang, mapping->lang)) 00403 { 00404 szFile = mapping->dict; 00405 encoding = mapping->enc; 00406 break; 00407 } 00408 } 00409 00410 if (!szFile || !strlen(szFile)) 00411 return false; 00412 00413 alloc_ispell_struct(); 00414 00415 hashname = loadDictionary(szFile); 00416 if (hashname.isEmpty()) 00417 return false; 00418 00419 // one of the two above calls succeeded 00420 setDictionaryEncoding (hashname, encoding); 00421 00422 return true; 00423 } 00424 00425 void 00426 ISpellChecker::setDictionaryEncoding( const TQString& hashname, const char * encoding ) 00427 { 00428 /* Get Hash encoding from XML file. This should always work! */ 00429 try_autodetect_charset(encoding); 00430 00431 if (m_translate_in) 00432 { 00433 /* We still have to setup prefstringchar*/ 00434 prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag 00435 : static_cast<int *>(NULL)); 00436 00437 if (prefstringchar < 0) 00438 { 00439 std::string teststring; 00440 for(int n1 = 1; n1 <= 15; n1++) 00441 { 00442 teststring = "latin" + n1; 00443 prefstringchar = findfiletype(teststring.c_str(), 1, 00444 deftflag < 0 ? &deftflag : static_cast<int *>(NULL)); 00445 if (prefstringchar >= 0) 00446 break; 00447 } 00448 } 00449 00450 return; /* success */ 00451 } 00452 00453 /* Test for UTF-8 first */ 00454 prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL)); 00455 if (prefstringchar >= 0) 00456 { 00457 m_translate_in = TQTextCodec::codecForName("utf8"); 00458 } 00459 00460 if (m_translate_in) 00461 return; /* success */ 00462 00463 /* Test for "latinN" */ 00464 if (!m_translate_in) 00465 { 00466 /* Look for "altstringtype" names from latin1 to latin15 */ 00467 for(int n1 = 1; n1 <= 15; n1++) 00468 { 00469 TQString teststring = TQString("latin%1").arg(n1); 00470 prefstringchar = findfiletype(teststring.latin1(), 1, 00471 deftflag < 0 ? &deftflag : static_cast<int *>(NULL)); 00472 if (prefstringchar >= 0) 00473 { 00474 //FIXME: latin1 might be wrong 00475 m_translate_in = TQTextCodec::codecForName( teststring.latin1() ); 00476 break; 00477 } 00478 } 00479 } 00480 00481 /* If nothing found, use latin1 */ 00482 if (!m_translate_in) 00483 { 00484 m_translate_in = TQTextCodec::codecForName("latin1"); 00485 } 00486 } 00487 00488 bool 00489 ISpellChecker::requestDictionary(const char *szLang) 00490 { 00491 if (!loadDictionaryForLanguage (szLang)) 00492 { 00493 // handle a shortened version of the language tag: en_US => en 00494 std::string shortened_dict (szLang); 00495 size_t uscore_pos; 00496 00497 if ((uscore_pos = shortened_dict.rfind ('_')) != ((size_t)-1)) { 00498 shortened_dict = shortened_dict.substr(0, uscore_pos); 00499 if (!loadDictionaryForLanguage (shortened_dict.c_str())) 00500 return false; 00501 } else 00502 return false; 00503 } 00504 00505 m_bSuccessfulInit = true; 00506 00507 if (prefstringchar < 0) 00508 m_defdupchar = 0; 00509 else 00510 m_defdupchar = prefstringchar; 00511 00512 return true; 00513 }