libept
|
00001 #ifndef EPT_TEXTSEARCH_TEXTSEARCH_H 00002 #define EPT_TEXTSEARCH_TEXTSEARCH_H 00003 00009 /* 00010 * Copyright (C) 2007 Enrico Zini <enrico@debian.org> 00011 * 00012 * This program is free software; you can redistribute it and/or modify 00013 * it under the terms of the GNU General Public License as published by 00014 * the Free Software Foundation; either version 2 of the License, or 00015 * (at your option) any later version. 00016 * 00017 * This program is distributed in the hope that it will be useful, 00018 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00019 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00020 * GNU General Public License for more details. 00021 * 00022 * You should have received a copy of the GNU General Public License 00023 * along with this program; if not, write to the Free Software 00024 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 00025 */ 00026 00027 #include <xapian.h> 00028 #include <vector> 00029 #include <string> 00030 00031 namespace ept { 00032 namespace apt { 00033 class Apt; 00034 class PackageRecord; 00035 } 00036 namespace debtags { 00037 class Debtags; 00038 } 00039 namespace textsearch { 00040 00041 // Allocate value indexes for known values 00042 const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1; 00043 const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2; 00044 const Xapian::valueno VAL_POPCON = 10; 00045 const Xapian::valueno VAL_ITERATING_RATING = 20; 00046 const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21; 00047 const Xapian::valueno VAL_ITERATING_USABILITY = 22; 00048 const Xapian::valueno VAL_ITERATING_SECURITY = 23; 00049 const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24; 00050 const Xapian::valueno VAL_ITERATING_QUALITY = 25; 00051 const Xapian::valueno VAL_ITERATING_SUPPORT = 26; 00052 const Xapian::valueno VAL_ITERATING_ADOPTION = 27; 00053 // If you need to index a value and cannot edit this file, feel free to use any 00054 // value starting from 1000000 00055 00056 00057 /* 00058 Fallback on apt scan searches when index is not present 00059 00060 Explicitly decide at instantiation (or at any other time) if a rebuild should 00061 be performed. Just adding a 'rebuildIfNeeded' method would be enough. 00062 00063 17:14 #xapian < enrico> Hello. I'm finally in a position of writing a library to maintain 00064 a xapian index with Debian package descriptions in a Debian system 00065 17:14 #xapian < enrico> I have a question, though 00066 17:14 #xapian < enrico> The descriptions change regularly as people run 'apt-get update' 00067 17:15 #xapian < enrico> I'd need to have a way to update the description index after 00068 apt-get update, without rebuilding it from scratch 00069 17:15 #xapian < enrico> Is there some documentation on how to do that? I can't exactly 00070 tell Xapian "the new description for package foo is this" because 00071 I'd need the xapian id 00072 19:11 #xapian < omega> you can add a unique term with a boolean prefix? 00073 19:11 #xapian < omega> like Qpackage-name 00074 19:11 #xapian < omega> then you search for it and replace_document 00075 19:24 #xapian < richardb> Or indeed, you use the "replace_document()" form which takes a 00076 unique_id term. 00077 19:25 #xapian < richardb> Xapian::docid replace_document(const std::string & 00078 unique_term, 00079 19:25 #xapian < richardb> const Xapian::Document & 00080 document); 00081 19:43 #xapian < enrico> unique term 00082 19:43 #xapian < enrico> nice! 00083 19:44 #xapian < enrico> can I use a non-alpha prefix, like :package-name ? 00084 19:45 #xapian < enrico> or pkg:package-name 00085 19:45 #xapian < enrico> I suppose I can 00086 */ 00087 00102 class TextSearch 00103 { 00104 protected: 00105 time_t m_timestamp; 00106 Xapian::Database m_db; 00107 Xapian::Stem m_stem; 00108 00110 static std::string toLower(const std::string& str); 00111 00118 void normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const; 00119 00120 public: 00121 struct ExtraIndexer 00122 { 00123 virtual ~ExtraIndexer() {} 00124 virtual void operator()(Xapian::Document& doc, const apt::PackageRecord& rec) const = 0; 00125 }; 00126 00127 TextSearch(); 00128 00130 Xapian::Database& db() { return m_db; } 00131 00133 const Xapian::Database& db() const { return m_db; } 00134 00136 time_t timestamp() const { return m_timestamp; } 00137 00139 bool hasData() const { return m_timestamp > 0; } 00140 00142 bool needsRebuild(apt::Apt& apt); 00143 00153 bool rebuildIfNeeded( 00154 apt::Apt& apt, 00155 const std::vector<const ExtraIndexer*>& extraIndexers = std::vector<const ExtraIndexer*>()); 00156 00160 Xapian::docid docidByName(const std::string& pkgname) const; 00161 00165 Xapian::Query makeORQuery(const std::string& keywords) const; 00166 00173 Xapian::Query makePartialORQuery(const std::string& keywords) const; 00174 00178 template<typename ITER> 00179 Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const 00180 { 00181 std::vector<std::string> terms; 00182 // Insert both the lowercased and the stemmed lowercased query terms 00183 for (ITER i = begin; i != end; ++i) 00184 { 00185 std::string t = toLower(*i); 00186 std::string s = m_stem(t); 00187 terms.push_back(t); 00188 if (s != t) 00189 terms.push_back("Z" + s); 00190 } 00191 return Xapian::Query(Xapian::Query::OP_OR, terms.begin(), terms.end()); 00192 } 00193 00195 std::vector<std::string> expand(Xapian::Enquire& enq) const; 00196 00197 // std::vector<std::string> similar(const std::string& pkg); 00198 00202 Xapian::Query makeRelatedQuery(const std::string& pkgname) const; 00203 00207 double getDoubleValue(const std::string& pkgname, Xapian::valueno val_id) const; 00208 00212 int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const; 00213 }; 00214 00215 } 00216 } 00217 00218 // vim:set ts=4 sw=4: 00219 #endif