• Skip to content
  • Skip to link menu
Trinity API Reference
  • Trinity API Reference
  • kspell2
 

kspell2

ispell_checker.cpp
00001 /* vim: set sw=8: -*- Mode: C++; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
00002 /* kspell2 - adopted from Enchant
00003  * Copyright (C) 2003 Dom Lachowicz
00004  * Copyright (C) 2004 Zack Rusin <zack@kde.org>
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the
00018  * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
00019  * Boston, MA 02110-1301, USA.
00020  *
00021  * In addition, as a special exception, Dom Lachowicz
00022  * gives permission to link the code of this program with
00023  * non-LGPL Spelling Provider libraries (eg: a MSFT Office
00024  * spell checker backend) and distribute linked combinations including
00025  * the two.  You must obey the GNU Lesser General Public License in all
00026  * respects for all of the code used other than said providers.  If you modify
00027  * this file, you may extend this exception to your version of the
00028  * file, but you are not obligated to do so.  If you do not wish to
00029  * do so, delete this exception statement from your version.
00030  */
00031 
00032 #include <config.h>
00033 
00034 #include <stdio.h>
00035 #include <stdlib.h>
00036 #include <string.h>
00037 
00038 #include <string>
00039 #include <vector>
00040 
00041 #include "sp_spell.h"
00042 #include "ispell_checker.h"
00043 
00044 #include <tqmap.h>
00045 #include <tqdir.h>
00046 #include <tqfileinfo.h>
00047 
00048 /***************************************************************************/
00049 
/* One row of the language table: ties a language tag to the ispell hash
 * file that serves it and to the character encoding used for that file. */
typedef struct str_ispell_map
{
    const char * lang;  /* language tag, e.g. "en" or "en_US" */
    const char * dict;  /* hash file name, e.g. "american.hash" */
    const char * enc;   /* encoding name, e.g. "iso-8859-1" */
} IspellMap;
00056 
00057 static const char *ispell_dirs [] = {
00058 #ifdef ISPELL_LIBDIR
00059     ISPELL_LIBDIR,
00060 #else
00061     "/usr/" SYSTEM_LIBDIR "/ispell",
00062     "/usr/lib/ispell",
00063     "/usr/local/" SYSTEM_LIBDIR "/ispell",
00064     "/usr/local/lib/ispell",
00065     "/usr/local/share/ispell",
00066     "/usr/share/ispell",
00067     "/usr/pkg/lib",
00068 #endif
00069     0
00070 };
00071 static const IspellMap ispell_map [] = {
00072     {"ca"    ,"catala.hash"         ,"iso-8859-1" },
00073     {"ca_ES" ,"catala.hash"         ,"iso-8859-1" },
00074     {"cs"    ,"czech.hash"          ,"iso-8859-2" },
00075     {"cs_CZ" ,"czech.hash"          ,"iso-8859-2" },
00076     {"da"    ,"dansk.hash"          ,"iso-8859-1" },
00077     {"da_DK" ,"dansk.hash"          ,"iso-8859-1" },
00078     {"de"    ,"deutsch.hash"        ,"iso-8859-1" },
00079     {"de_CH" ,"swiss.hash"          ,"iso-8859-1" },
00080     {"de_AT" ,"deutsch.hash"        ,"iso-8859-1" },
00081     {"de_DE" ,"deutsch.hash"        ,"iso-8859-1" },
00082     {"el"    ,"ellhnika.hash"       ,"iso-8859-7" },
00083     {"el_GR" ,"ellhnika.hash"       ,"iso-8859-7" },
00084     {"en"    ,"british.hash"        ,"iso-8859-1" },
00085     {"en_AU" ,"british.hash"        ,"iso-8859-1" },
00086     {"en_BZ" ,"british.hash"        ,"iso-8859-1" },
00087     {"en_CA" ,"british.hash"        ,"iso-8859-1" },
00088     {"en_GB" ,"british.hash"        ,"iso-8859-1" },
00089     {"en_IE" ,"british.hash"        ,"iso-8859-1" },
00090     {"en_JM" ,"british.hash"        ,"iso-8859-1" },
00091     {"en_NZ" ,"british.hash"        ,"iso-8859-1" },
00092     {"en_TT" ,"british.hash"        ,"iso-8859-1" },
00093     {"en_ZA" ,"british.hash"        ,"iso-8859-1" },
00094     {"en_ZW" ,"british.hash"        ,"iso-8859-1" },
00095     {"en_PH" ,"american.hash"       ,"iso-8859-1" },
00096     {"en_US" ,"american.hash"       ,"iso-8859-1" },
00097     {"eo"    ,"esperanto.hash"      ,"iso-8859-3" },
00098     {"es"    ,"espanol.hash"        ,"iso-8859-1" },
00099     {"es_AR" ,"espanol.hash"        ,"iso-8859-1" },
00100     {"es_BO" ,"espanol.hash"        ,"iso-8859-1" },
00101     {"es_CL" ,"espanol.hash"        ,"iso-8859-1" },
00102     {"es_CO" ,"espanol.hash"        ,"iso-8859-1" },
00103     {"es_CR" ,"espanol.hash"        ,"iso-8859-1" },
00104     {"es_DO" ,"espanol.hash"        ,"iso-8859-1" },
00105     {"es_EC" ,"espanol.hash"        ,"iso-8859-1" },
00106     {"es_ES" ,"espanol.hash"        ,"iso-8859-1" },
00107     {"es_GT" ,"espanol.hash"        ,"iso-8859-1" },
00108     {"es_HN" ,"espanol.hash"        ,"iso-8859-1" },
00109     {"es_MX" ,"espanol.hash"        ,"iso-8859-1" },
00110     {"es_NI" ,"espanol.hash"        ,"iso-8859-1" },
00111     {"es_PA" ,"espanol.hash"        ,"iso-8859-1" },
00112     {"es_PE" ,"espanol.hash"        ,"iso-8859-1" },
00113     {"es_PR" ,"espanol.hash"        ,"iso-8859-1" },
00114     {"es_PY" ,"espanol.hash"        ,"iso-8859-1" },
00115     {"es_SV" ,"espanol.hash"        ,"iso-8859-1" },
00116     {"es_UY" ,"espanol.hash"        ,"iso-8859-1" },
00117     {"es_VE" ,"espanol.hash"        ,"iso-8859-1" },
00118     {"fi"    ,"finnish.hash"        ,"iso-8859-1" },
00119     {"fi_FI" ,"finnish.hash"        ,"iso-8859-1" },
00120     {"fr"    ,"francais.hash"       ,"iso-8859-1" },
00121     {"fr_BE" ,"francais.hash"       ,"iso-8859-1" },
00122     {"fr_CA" ,"francais.hash"       ,"iso-8859-1" },
00123     {"fr_CH" ,"francais.hash"       ,"iso-8859-1" },
00124     {"fr_FR" ,"francais.hash"       ,"iso-8859-1" },
00125     {"fr_LU" ,"francais.hash"       ,"iso-8859-1" },
00126     {"fr_MC" ,"francais.hash"       ,"iso-8859-1" },
00127     {"hu"    ,"hungarian.hash"      ,"iso-8859-2" },
00128     {"hu_HU" ,"hungarian.hash"      ,"iso-8859-2" },
00129     {"ga"    ,"irish.hash"          ,"iso-8859-1" },
00130     {"ga_IE" ,"irish.hash"          ,"iso-8859-1" },
00131     {"gl"    ,"galician.hash"       ,"iso-8859-1" },
00132     {"gl_ES" ,"galician.hash"       ,"iso-8859-1" },
00133     {"ia"    ,"interlingua.hash"    ,"iso-8859-1" },
00134     {"it"    ,"italian.hash"        ,"iso-8859-1" },
00135     {"it_IT" ,"italian.hash"        ,"iso-8859-1" },
00136     {"it_CH" ,"italian.hash"        ,"iso-8859-1" },
00137     {"la"    ,"mlatin.hash"         ,"iso-8859-1" },
00138     {"la_IT" ,"mlatin.hash"         ,"iso-8859-1" },
00139     {"lt"    ,"lietuviu.hash"       ,"iso-8859-13" },
00140     {"lt_LT" ,"lietuviu.hash"       ,"iso-8859-13" },
00141     {"nl"    ,"nederlands.hash"     ,"iso-8859-1" },
00142     {"nl_NL" ,"nederlands.hash"     ,"iso-8859-1" },
00143     {"nl_BE" ,"nederlands.hash"     ,"iso-8859-1" },
00144     {"nb"    ,"norsk.hash"          ,"iso-8859-1" },
00145     {"nb_NO" ,"norsk.hash"          ,"iso-8859-1" },
00146     {"nn"    ,"nynorsk.hash"        ,"iso-8859-1" },
00147     {"nn_NO" ,"nynorsk.hash"        ,"iso-8859-1" },
00148     {"no"    ,"norsk.hash"          ,"iso-8859-1" },
00149     {"no_NO" ,"norsk.hash"          ,"iso-8859-1" },
00150     {"pl"    ,"polish.hash"         ,"iso-8859-2" },
00151     {"pl_PL" ,"polish.hash"         ,"iso-8859-2" },
00152     {"pt"    ,"brazilian.hash"      ,"iso-8859-1" },
00153     {"pt_BR" ,"brazilian.hash"      ,"iso-8859-1" },
00154     {"pt_PT" ,"portugues.hash"      ,"iso-8859-1" },
00155     {"ru"    ,"russian.hash"        ,"koi8-r" },
00156     {"ru_MD" ,"russian.hash"        ,"koi8-r" },
00157     {"ru_RU" ,"russian.hash"        ,"koi8-r" },
00158     {"sc"    ,"sardinian.hash"      ,"iso-8859-1" },
00159     {"sc_IT" ,"sardinian.hash"      ,"iso-8859-1" },
00160     {"sk"    ,"slovak.hash"         ,"iso-8859-2" },
00161     {"sk_SK" ,"slovak.hash"         ,"iso-8859-2" },
00162     {"sl"    ,"slovensko.hash"      ,"iso-8859-2" },
00163     {"sl_SI" ,"slovensko.hash"      ,"iso-8859-2" },
00164     {"sv"    ,"svenska.hash"        ,"iso-8859-1" },
00165     {"sv_SE" ,"svenska.hash"        ,"iso-8859-1" },
00166     {"uk"    ,"ukrainian.hash"      ,"koi8-u" },
00167     {"uk_UA" ,"ukrainian.hash"      ,"koi8-u" },
00168     {"yi"    ,"yiddish-yivo.hash"   ,"utf-8" }
00169 };
00170 
00171 static const size_t size_ispell_map = ( sizeof(ispell_map) / sizeof((ispell_map)[0]) );
00172 static TQMap<TQString, TQString> ispell_dict_map;
00173 
00174 
00175 void
00176 ISpellChecker::try_autodetect_charset(const char * const inEncoding)
00177 {
00178     if (inEncoding && strlen(inEncoding))
00179         {
00180             m_translate_in = TQTextCodec::codecForName(inEncoding);
00181         }
00182 }
00183 
00184 /***************************************************************************/
00185 /***************************************************************************/
00186 
00187 ISpellChecker::ISpellChecker()
00188     : deftflag(-1),
00189      prefstringchar(-1),
00190      m_bSuccessfulInit(false),
00191      m_BC(NULL),
00192      m_cd(NULL),
00193      m_cl(NULL),
00194      m_cm(NULL),
00195      m_ho(NULL),
00196      m_nd(NULL),
00197      m_so(NULL),
00198      m_se(NULL),
00199      m_ti(NULL),
00200      m_te(NULL),
00201      m_hashstrings(NULL),
00202      m_hashtbl(NULL),
00203      m_pflaglist(NULL),
00204      m_sflaglist(NULL),
00205      m_chartypes(NULL),
00206      m_infile(NULL),
00207      m_outfile(NULL),
00208      m_askfilename(NULL),
00209      m_Trynum(0),
00210      m_translate_in(0)
00211 {
00212     memset(m_sflagindex,0,sizeof(m_sflagindex));
00213     memset(m_pflagindex,0,sizeof(m_pflagindex));
00214 }
00215 
/* FREEP(p): free() the pointer p unless it is null.  Wrapped in
 * do { } while (0) so it acts as a single statement after if/else. */
#ifndef FREEP
#define FREEP(p)                \
    do {                        \
        if (p)                  \
            free(p);            \
    } while (0)
#endif
00219 
00220 ISpellChecker::~ISpellChecker()
00221 {
00222     if (m_bSuccessfulInit) {
00223         // only cleanup our mess if we were successfully initialized
00224 
00225         clearindex (m_pflagindex);
00226         clearindex (m_sflagindex);
00227     }
00228 
00229     FREEP(m_hashtbl);
00230     FREEP(m_hashstrings);
00231     FREEP(m_sflaglist);
00232     FREEP(m_chartypes);
00233 
00234     delete m_translate_in;
00235     m_translate_in = 0;
00236 }
00237 
00238 bool
00239 ISpellChecker::checkWord( const TQString& utf8Word )
00240 {
00241     ichar_t iWord[INPUTWORDLEN + MAXAFFIXLEN];
00242     if (!m_bSuccessfulInit)
00243         return false;
00244 
00245     if (!utf8Word || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) || utf8Word.isEmpty())
00246         return false;
00247 
00248     bool retVal = false;
00249     TQCString out;
00250     if (!m_translate_in)
00251         return false;
00252     else {
00253         /* convert to 8bit string and null terminate */
00254         int len_out = utf8Word.length();
00255 
00256         out = m_translate_in->fromUnicode( utf8Word, len_out );
00257     }
00258 
00259     if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
00260         {
00261             if (good(iWord, 0, 0, 1, 0) == 1 ||
00262                 compoundgood(iWord, 1) == 1)
00263                 {
00264                     retVal = true;
00265                 }
00266         }
00267 
00268     return retVal;
00269 }
00270 
00271 TQStringList
00272 ISpellChecker::suggestWord(const TQString& utf8Word)
00273 {
00274     ichar_t  iWord[INPUTWORDLEN + MAXAFFIXLEN];
00275     int  c;
00276 
00277     if (!m_bSuccessfulInit)
00278         return TQStringList();
00279 
00280     if (utf8Word.isEmpty() || utf8Word.length() >= (INPUTWORDLEN + MAXAFFIXLEN) ||
00281             utf8Word.length() == 0)
00282         return TQStringList();
00283 
00284     TQCString out;
00285     if (!m_translate_in)
00286         return TQStringList();
00287     else
00288         {
00289             /* convert to 8bit string and null terminate */
00290 
00291             int len_out = utf8Word.length();
00292             out = m_translate_in->fromUnicode( utf8Word, len_out );
00293         }
00294 
00295     if (!strtoichar(iWord, out.data(), INPUTWORDLEN + MAXAFFIXLEN, 0))
00296         makepossibilities(iWord);
00297     else
00298         return TQStringList();
00299 
00300     TQStringList sugg_arr;
00301     for (c = 0; c < m_pcount; c++)
00302     {
00303         TQString utf8Word;
00304 
00305         if (!m_translate_in)
00306         {
00307             /* copy to 8bit string and null terminate */
00308             utf8Word = TQString::fromUtf8( m_possibilities[c] );
00309         }
00310         else
00311         {
00312             /* convert to 32bit string and null terminate */
00313             utf8Word = m_translate_in->toUnicode( m_possibilities[c] );
00314         }
00315 
00316         sugg_arr.append( utf8Word );
00317     }
00318 
00319     return sugg_arr;
00320 }
00321 
00322 static void
00323 s_buildHashNames (std::vector<std::string> & names, const char * dict)
00324 {
00325     const char * tmp = 0;
00326     int i = 0;
00327 
00328     names.clear ();
00329 
00330     while ( (tmp = ispell_dirs[i++]) ) {
00331         TQCString maybeFile = TQCString( tmp ) + '/';
00332         maybeFile += dict;
00333         names.push_back( maybeFile.data() );
00334     }
00335 }
00336 
00337 static void
00338 s_allDics()
00339 {
00340     const char * tmp = 0;
00341     int i = 0;
00342 
00343     while ( (tmp = ispell_dirs[i++]) ) {
00344         TQDir dir( tmp );
00345         TQStringList lst = dir.entryList( "*.hash" );
00346         for ( TQStringList::Iterator it = lst.begin(); it != lst.end(); ++it ) {
00347             TQFileInfo info( *it );
00348             for (size_t i = 0; i < size_ispell_map; i++)
00349             {
00350                 const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
00351                 if (!strcmp (info.fileName().latin1(), mapping->dict))
00352                 {
00353                     ispell_dict_map.insert( mapping->lang, *it );
00354                 }
00355             }
00356         }
00357     }
00358 }
00359 
00360 TQValueList<TQString>
00361 ISpellChecker::allDics()
00362 {
00363     if ( ispell_dict_map.empty() )
00364         s_allDics();
00365 
00366     return ispell_dict_map.keys();
00367 }
00368 
00369 TQString
00370 ISpellChecker::loadDictionary (const char * szdict)
00371 {
00372     std::vector<std::string> dict_names;
00373 
00374     s_buildHashNames (dict_names, szdict);
00375 
00376     for (size_t i = 0; i < dict_names.size(); i++)
00377         {
00378             if (linit(const_cast<char*>(dict_names[i].c_str())) >= 0)
00379                 return dict_names[i].c_str();
00380         }
00381 
00382     return TQString::null;
00383 }
00384 
00391 bool
00392 ISpellChecker::loadDictionaryForLanguage ( const char * szLang )
00393 {
00394     TQString hashname;
00395 
00396     const char * encoding = NULL;
00397     const char * szFile = NULL;
00398 
00399     for (size_t i = 0; i < size_ispell_map; i++)
00400         {
00401             const IspellMap * mapping = (const IspellMap *)(&(ispell_map[i]));
00402             if (!strcmp (szLang, mapping->lang))
00403                 {
00404                     szFile = mapping->dict;
00405                     encoding = mapping->enc;
00406                     break;
00407                 }
00408         }
00409 
00410     if (!szFile || !strlen(szFile))
00411         return false;
00412 
00413     alloc_ispell_struct();
00414 
00415     hashname = loadDictionary(szFile);
00416     if (hashname.isEmpty())
00417         return false;
00418 
00419     // one of the two above calls succeeded
00420     setDictionaryEncoding (hashname, encoding);
00421 
00422     return true;
00423 }
00424 
00425 void
00426 ISpellChecker::setDictionaryEncoding( const TQString& hashname, const char * encoding )
00427 {
00428     /* Get Hash encoding from XML file. This should always work! */
00429     try_autodetect_charset(encoding);
00430 
00431     if (m_translate_in)
00432         {
00433             /* We still have to setup prefstringchar*/
00434             prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag
00435                               : static_cast<int *>(NULL));
00436 
00437             if (prefstringchar < 0)
00438                 {
00439                     std::string teststring;
00440                     for(int n1 = 1; n1 <= 15; n1++)
00441                         {
00442                             teststring = "latin" + n1;
00443                             prefstringchar = findfiletype(teststring.c_str(), 1,
00444                                               deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00445                             if (prefstringchar >= 0)
00446                                 break;
00447                         }
00448                 }
00449 
00450             return; /* success */
00451         }
00452 
00453     /* Test for UTF-8 first */
00454     prefstringchar = findfiletype("utf8", 1, deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00455     if (prefstringchar >= 0)
00456         {
00457             m_translate_in = TQTextCodec::codecForName("utf8");
00458         }
00459 
00460     if (m_translate_in)
00461         return; /* success */
00462 
00463     /* Test for "latinN" */
00464     if (!m_translate_in)
00465         {
00466             /* Look for "altstringtype" names from latin1 to latin15 */
00467             for(int n1 = 1; n1 <= 15; n1++)
00468                 {
00469                     TQString teststring = TQString("latin%1").arg(n1);
00470                     prefstringchar = findfiletype(teststring.latin1(), 1,
00471                                       deftflag < 0 ? &deftflag : static_cast<int *>(NULL));
00472                     if (prefstringchar >= 0)
00473                         {
00474                             //FIXME: latin1 might be wrong
00475                             m_translate_in = TQTextCodec::codecForName( teststring.latin1() );
00476                             break;
00477                         }
00478                 }
00479         }
00480 
00481     /* If nothing found, use latin1 */
00482     if (!m_translate_in)
00483         {
00484             m_translate_in = TQTextCodec::codecForName("latin1");
00485         }
00486 }
00487 
00488 bool
00489 ISpellChecker::requestDictionary(const char *szLang)
00490 {
00491     if (!loadDictionaryForLanguage (szLang))
00492         {
00493             // handle a shortened version of the language tag: en_US => en
00494             std::string shortened_dict (szLang);
00495             size_t uscore_pos;
00496 
00497             if ((uscore_pos = shortened_dict.rfind ('_')) != ((size_t)-1)) {
00498                 shortened_dict = shortened_dict.substr(0, uscore_pos);
00499                 if (!loadDictionaryForLanguage (shortened_dict.c_str()))
00500                     return false;
00501             } else
00502                 return false;
00503         }
00504 
00505     m_bSuccessfulInit = true;
00506 
00507     if (prefstringchar < 0)
00508         m_defdupchar = 0;
00509     else
00510         m_defdupchar = prefstringchar;
00511 
00512     return true;
00513 }

kspell2

Skip menu "kspell2"
  • Main Page
  • Namespace List
  • Class Hierarchy
  • Alphabetical List
  • Class List
  • File List
  • Class Members

kspell2

Skip menu "kspell2"
  • arts
  • dcop
  • dnssd
  • interfaces
  •     interface
  •     library
  •   kspeech
  •   ktexteditor
  • kabc
  • kate
  • kcmshell
  • kdecore
  • kded
  • kdefx
  • kdeprint
  • kdesu
  • kdeui
  • kdoctools
  • khtml
  • kimgio
  • kinit
  • kio
  •   bookmarks
  •   httpfilter
  •   kfile
  •   kio
  •   kioexec
  •   kpasswdserver
  •   kssl
  • kioslave
  •   http
  • kjs
  • kmdi
  •   kmdi
  • knewstuff
  • kparts
  • krandr
  • kresources
  • kspell2
  • kunittest
  • kutils
  • kwallet
  • libkmid
  • libkscreensaver
Generated for kspell2 by doxygen 1.7.6.1
This website is maintained by Timothy Pearson.
KDE® and the K Desktop Environment® logo are registered trademarks of KDE e.V. |