libept
textsearch.h
Go to the documentation of this file.
00001 #ifndef EPT_TEXTSEARCH_TEXTSEARCH_H
00002 #define EPT_TEXTSEARCH_TEXTSEARCH_H
00003 
00009 /*
00010  * Copyright (C) 2007  Enrico Zini <enrico@debian.org>
00011  *
00012  * This program is free software; you can redistribute it and/or modify
00013  * it under the terms of the GNU General Public License as published by
00014  * the Free Software Foundation; either version 2 of the License, or
00015  * (at your option) any later version.
00016  *
00017  * This program is distributed in the hope that it will be useful,
00018  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00019  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020  * GNU General Public License for more details.
00021  *
00022  * You should have received a copy of the GNU General Public License
00023  * along with this program; if not, write to the Free Software
00024  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00025  */
00026 
00027 #include <xapian.h>
00028 #include <vector>
00029 #include <string>
00030 
00031 namespace ept {
00032 namespace apt {
00033 class Apt;
00034 class PackageRecord;
00035 }
00036 namespace debtags {
00037 class Debtags;
00038 }
00039 namespace textsearch {
00040 
// Value numbers (Xapian::valueno) reserved by this header.  TextSearch's
// getDoubleValue() and getIntValue() take a Xapian::valueno as their val_id
// argument.  The numbering is sparse: 1-2, 10 and 20-27 are assigned here.
// NOTE(review): what each value stores is not shown in this file; the names
// suggest apt size fields, a popcon score and per-aspect ratings -- confirm
// against the code that fills them in.
00041 // Allocate value indexes for known values
00042 const Xapian::valueno VAL_APT_INSTALLED_SIZE      =  1;
00043 const Xapian::valueno VAL_APT_PACKAGE_SIZE        =  2;
00044 const Xapian::valueno VAL_POPCON                  = 10;
00045 const Xapian::valueno VAL_ITERATING_RATING        = 20;
00046 const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21;
00047 const Xapian::valueno VAL_ITERATING_USABILITY     = 22;
00048 const Xapian::valueno VAL_ITERATING_SECURITY      = 23;
00049 const Xapian::valueno VAL_ITERATING_PERFORMANCE   = 24;
00050 const Xapian::valueno VAL_ITERATING_QUALITY       = 25;
00051 const Xapian::valueno VAL_ITERATING_SUPPORT       = 26;
00052 const Xapian::valueno VAL_ITERATING_ADOPTION      = 27;
00053 // If you need to index a value and cannot edit this file, feel free to use any
00054 // value starting from 1000000
00055 
00056 
00057 /*
00058 Fall back on apt scan searches when the index is not present
00059 
00060 Explicitly decide at instantiation (or at any other time) if a rebuild should
00061 be performed.  Just adding a 'rebuildIfNeeded' method would be enough.
00062 
00063 17:14 #xapian < enrico> Hello.  I'm finally in a position of writing a library to maintain
00064                         a xapian index with Debian package descriptions in a Debian system
00065 17:14 #xapian < enrico> I have a question, though
00066 17:14 #xapian < enrico> The descriptions change regularly as people run 'apt-get update'
00067 17:15 #xapian < enrico> I'd need to have a way to update the description index after
00068                         apt-get update, without rebuilding it from scratch
00069 17:15 #xapian < enrico> Is there some documentation on how to do that?  I can't exactly
00070                         tell Xapian "the new description for package foo is this" because
00071                         I'd need the xapian id
00072 19:11 #xapian < omega> you can add a unique term with a boolean prefix?
00073 19:11 #xapian < omega> like Qpackage-name
00074 19:11 #xapian < omega> then you search for it and replace_document
00075 19:24 #xapian < richardb> Or indeed, you use the "replace_document()" form which takes a
00076                           unique_id term.
00077 19:25 #xapian < richardb>         Xapian::docid replace_document(const std::string &
00078                           unique_term,
00079 19:25 #xapian < richardb>                                        const Xapian::Document &
00080                           document);
00081 19:43 #xapian < enrico> unique term
00082 19:43 #xapian < enrico> nice!
00083 19:44 #xapian < enrico> can I use a non-alpha prefix, like :package-name ?
00084 19:45 #xapian < enrico> or pkg:package-name
00085 19:45 #xapian < enrico> I suppose I can
00086 */
00087 
/*
 * TextSearch bundles a Xapian::Database, a Xapian::Stem and a timestamp, and
 * declares helpers to build Xapian queries from keywords or package names and
 * to read per-package values from the database.
 *
 * Only the accessors and the templated makeORQuery() are defined in this
 * header; every other member is declared here and defined elsewhere.
 *
 * NOTE(review): time_t is used below but no <ctime>/<time.h> include is
 * visible in this header; it presumably arrives through another include --
 * confirm.
 */
00102 class TextSearch
00103 {
00104 protected:
          // Value returned by timestamp(); hasData() reports true when it is > 0.
          // NOTE(review): presumably the timestamp of the on-disk index -- confirm.
00105     time_t m_timestamp;
          // Database handle exposed through db().
00106     Xapian::Database m_db;
          // Stemmer applied to lowercased terms in the templated makeORQuery().
00107     Xapian::Stem m_stem;
00108 
          // Declared only.  The templated makeORQuery() calls it on every input
          // term before stemming; presumably returns a lowercased copy of str.
00110     static std::string toLower(const std::string& str);
00111 
          // Declared only.  Takes the document and the position counter by
          // non-const reference, so both may be modified.
          // NOTE(review): presumably normalises term and adds it to doc at
          // position pos -- confirm against the definition.
00118     void normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const;
00119 
00120 public:
          // Abstract callback interface: implementations provide operator(),
          // which receives a Xapian::Document and an apt::PackageRecord.
          // rebuildIfNeeded() accepts a vector of pointers to such objects.
          // NOTE(review): presumably lets callers add extra data to each
          // package document during a rebuild -- confirm.
00121     struct ExtraIndexer
00122     {
              // Virtual destructor so implementations can be destroyed through
              // an ExtraIndexer pointer.
00123         virtual ~ExtraIndexer() {}
00124         virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const = 0;
00125     };
00126 
          // Default constructor; defined elsewhere.
00127     TextSearch();
00128 
          // Mutable access to the underlying database.
00130     Xapian::Database& db() { return m_db; }
00131 
          // Read-only access to the underlying database.
00133     const Xapian::Database& db() const { return m_db; }
00134 
          // Returns m_timestamp.
00136     time_t timestamp() const { return m_timestamp; }
00137 
          // True when m_timestamp is greater than zero.
00139     bool hasData() const { return m_timestamp > 0; }
00140 
          // Declared only.  NOTE(review): presumably compares the index with
          // the state of apt to decide whether it is stale -- confirm.
00142     bool needsRebuild(apt::Apt& apt);
00143 
          // Declared only.  extraIndexers defaults to an empty vector, so the
          // call can be made with just an apt::Apt.
          // NOTE(review): the meaning of the bool result is not visible here;
          // presumably true when a rebuild was performed -- confirm.
00153     bool rebuildIfNeeded(
00154         apt::Apt& apt,
00155         const std::vector<const ExtraIndexer*>& extraIndexers = std::vector<const ExtraIndexer*>());
00156 
          // Declared only.  Maps a package name to a Xapian::docid.
00160     Xapian::docid docidByName(const std::string& pkgname) const;
00161 
          // Declared only.  Builds a query from a keyword string.
00165     Xapian::Query makeORQuery(const std::string& keywords) const;
00166 
          // Declared only.  NOTE(review): presumably like makeORQuery() but
          // treating the last keyword as a partial word -- confirm.
00173     Xapian::Query makePartialORQuery(const std::string& keywords) const;
00174 
          // Builds an OP_OR query from the terms in [begin, end).
          // Each element is lowercased with toLower() and stemmed with m_stem.
          // The lowercased term is always added; when the stem differs from
          // it, the stem is added as well, prefixed with "Z".
          // *i must be usable as a const std::string& argument to toLower().
00178     template<typename ITER>
00179     Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const
00180     {
00181         std::vector<std::string> terms;
00182         // Insert both the lowercased and the stemmed lowercased query terms
00183         for (ITER i = begin; i != end; ++i)
00184         {
00185             std::string t = toLower(*i);
00186             std::string s = m_stem(t);
00187             terms.push_back(t);
                  // Skip the stemmed form when stemming left the term unchanged.
00188             if (s != t)
00189                 terms.push_back("Z" + s);
00190         }
00191         return Xapian::Query(Xapian::Query::OP_OR, terms.begin(), terms.end());
00192     }
00193 
          // Declared only.  Returns a list of strings derived from an Enquire
          // object.  NOTE(review): presumably suggested expansion terms -- confirm.
00195     std::vector<std::string> expand(Xapian::Enquire& enq) const;
00196 
00197 //  std::vector<std::string> similar(const std::string& pkg);
00198 
          // Declared only.  Builds a query from a package name.
00202     Xapian::Query makeRelatedQuery(const std::string& pkgname) const;
00203 
          // Declared only.  Looks up value val_id for package pkgname and
          // returns it as a double.
00207     double getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const;
00208 
          // Declared only.  Looks up value val_id for package pkgname and
          // returns it as an int.
00212     int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const;
00213 };
00214 
00215 }
00216 }
00217 
00218 // vim:set ts=4 sw=4:
00219 #endif