libept
xapian.h
Go to the documentation of this file.
1 // -*- C++ -*-
2 #include <xapian.h>
3 #include <ept/core/apt.h>
4 #include <wibble/regexp.h>
5 #include <wibble/sys/pipe.h>
6 #include <wibble/sys/exec.h>
7 
8 #ifndef EPT_XAPIAN_H
9 #define EPT_XAPIAN_H
10 
11 namespace ept {
12 namespace core {
13 namespace xapian {
14 
15 // Allocate value indexes for known values
16 const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1;
17 const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2;
18 const Xapian::valueno VAL_POPCON = 10;
19 const Xapian::valueno VAL_ITERATING_RATING = 20;
20 const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21;
21 const Xapian::valueno VAL_ITERATING_USABILITY = 22;
22 const Xapian::valueno VAL_ITERATING_SECURITY = 23;
23 const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24;
24 const Xapian::valueno VAL_ITERATING_QUALITY = 25;
25 const Xapian::valueno VAL_ITERATING_SUPPORT = 26;
26 const Xapian::valueno VAL_ITERATING_ADOPTION = 27;
27 
28 struct TagFilter : public Xapian::ExpandDecider
29 {
30  virtual bool operator()(const std::string &term) const {
31  return term[0] == 'X' && term[1] == 'T';
32  }
33 };
34 
35 struct List {
36  char m_enqPlace[sizeof(Xapian::Enquire)];
37  mutable Xapian::MSet m_matches;
38  mutable Xapian::MSet::const_iterator m_iter;
39  mutable int m_pos;
40  typedef List Type;
41 
42  static const size_t chunkSize = 20;
43 
44  List head() const {
45  seek();
46  return *this;
47  }
48 
49  Token token() const {
50  Token t;
51  t._id = m_iter.get_document().get_data();
52  return t;
53  }
54 
55  bool operator<( const List &o ) const {
56  return token() < o.token();
57  }
58 
59  void seek() const {
60  if ( m_matches.size() == chunkSize && m_iter == m_matches.end() ) {
61  m_matches = enq().get_mset( m_pos, chunkSize );
62  m_iter = m_matches.begin();
63  m_pos += chunkSize;
64  }
65  }
66 
67  bool empty() const {
68  if ( m_pos == -1 )
69  return true;
70  seek();
71  return m_matches.size() < 30 && m_iter == m_matches.end();
72  }
73 
74  List tail() const {
75  List t = *this;
76  t.seek();
77  t.m_iter ++;
78  return t;
79  }
80 
81  Xapian::Enquire const &enq() const {
82  return *reinterpret_cast< Xapian::Enquire const * >( m_enqPlace );
83  }
84 
85  List( Xapian::Enquire _enq )
86  {
87  Xapian::Enquire *e = new (m_enqPlace) Xapian::Enquire( _enq );
88  assert_eq( e, &enq() );
89  m_matches = enq().get_mset( 0, chunkSize );
90  m_iter = m_matches.begin();
91  m_pos = chunkSize;
92  }
93 
94  List() {}
95 };
96 
97 struct Query {
98  Xapian::Database *m_db;
99  Xapian::Enquire m_enq;
100  Xapian::Stem m_stem;
101  typedef std::set< std::string > Terms;
103  int m_cutoff;
104  bool m_expand;
105 
106  void setQualityCutoff( int c ) {
107  m_cutoff = c;
108  }
109 
110  void setExpand( bool e ) { m_expand = e; }
111 
112  Query( Xapian::Database &e ) : m_db( &e ), m_enq( e ) {
113  m_cutoff = 50;
114  m_expand = true;
115  }
116 
117  wibble::Tokenizer queryTokenizer( std::string q ) const {
118  return wibble::Tokenizer( q, "[A-Za-z0-9._+:-]+", REG_EXTENDED );
119  }
120 
121  template< typename Out >
122  void tokenizeQuery( std::string q, Out o ) const
123  {
124  wibble::Tokenizer tok = queryTokenizer( q );
125  for (wibble::Tokenizer::const_iterator i = tok.begin(); i != tok.end(); ++i )
126  {
127  if ( (*i).find( "::" ) != std::string::npos ) { // assume tag
128  *o++ = ("XT" + *i);
129  } else {
130  std::string t = wibble::str::tolower(*i);
131  std::string s = m_stem(t);
132  *o++ = t;
133  if (s != t)
134  *o++ = ("Z" + s);
135  }
136  }
137  }
138 
139  template< typename Out >
140  void expand( Out o ) const
141  {
142  Xapian::RSet rset;
143  // Get the top 5 results as 'good ones' to compute the search expansion
144  Xapian::MSet mset = m_enq.get_mset(0, 5);
145  for ( Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i )
146  rset.add_document(i);
147  // Get the expanded set, only expanding the query with tag names
148  TagFilter tagf;
149  Xapian::ESet eset = m_enq.get_eset(5, rset, &tagf);
150  for ( Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i )
151  *o++ = *i;
152  }
153 
154  void updateEnquire() {
155  // set up query now
156  Xapian::Query inc( Xapian::Query::OP_OR,
157  m_include.begin(),
158  m_include.end() ),
159  exc( Xapian::Query::OP_OR,
160  m_exclude.begin(),
161  m_exclude.end() ),
162  secondary( Xapian::Query::OP_OR,
163  m_secondary.begin(),
164  m_secondary.end() ),
165  secondary1( Xapian::Query::OP_SCALE_WEIGHT, secondary, 0.02 ),
166  query1( Xapian::Query::OP_AND_NOT, inc, exc ),
167  query( Xapian::Query::OP_OR, query1, secondary1 );
168 
169  m_enq.set_query( query );
170 
171  if ( m_expand ) {
172  m_expand = false;
173  expand( std::inserter( m_include, m_include.begin() ) );
174  updateEnquire();
175  m_expand = true;
176  return;
177  }
178 
179  Xapian::MSet first = m_enq.get_mset(0, 1, 0, 0, 0);
180  Xapian::MSetIterator ifirst = first.begin();
181  if ( ifirst != first.end() ) {
182  // Xapian::percent cutoff = ifirst.get_percent() * m_cutoff / 100;
183  // m_enq.set_cutoff(cutoff);
184  }
185  }
186 
188  updateEnquire();
189  return List( m_enq );
190  }
191 
192  std::map< std::string, int > relevantTags( int n = 30 ) {
193  updateEnquire();
194  std::map< std::string, int > relev;
195  Xapian::RSet rset;
196  Xapian::MSet mset = m_enq.get_mset(0, 100);
197  for ( Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i )
198  rset.add_document(i);
199  // Get the expanded set, only expanding the query with tag names
200  TagFilter tagf;
201  Xapian::ESet eset = m_enq.get_eset(n, rset, &tagf);
202  for ( Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i )
203  relev.insert( relev.begin(),
204  std::make_pair(
205  std::string( *i, 2, std::string::npos ),
206  i.get_weight() ) );
207  return relev;
208  }
209 
210  void addTerms( std::string t, bool partial = false, bool exclude = false ) {
211  if ( t.empty() )
212  return;
213  Terms &to = exclude ? m_exclude : m_include;
214  std::vector< std::string > tok;
215  tokenizeQuery( t, std::back_inserter( tok ) );
216  if ( partial ) {
217  if ( tok.back().size() == 1 ) {
218  tok.pop_back();
219  } else {
220  std::copy(
221  m_db->allterms_begin( tok.back() ),
222  m_db->allterms_end( tok.back() ),
223  std::back_inserter( tok ) );
224  }
225  }
226  std::copy( tok.begin(), tok.end(), std::inserter( to, to.begin() ) );
227  }
228 
229  void addTerms( const Terms &t, bool exclude = false ) {
230  Terms &to = exclude ? m_exclude : m_include;
231  std::copy( t.begin(), t.end(), std::inserter( to, to.begin() ) );
232  }
233 
234  void addSecondaryTerm( const std::string &term, bool partial = false ) {
235  if ( partial ) {
236  std::copy(
237  m_db->allterms_begin( term ),
238  m_db->allterms_end( term ),
239  std::inserter( m_secondary, m_secondary.begin() ) );
240  } else {
241  m_include.insert( m_secondary.begin(), term );
242  }
243  }
244 
245 };
246 
247 struct Source
248 {
249 protected:
250  mutable Xapian::Database m_db;
251  Xapian::Stem m_stem;
252  mutable bool m_opened;
253 
255  static std::string toLower(const std::string& str);
256 
263  void normalize_and_add(Xapian::Document& doc, const std::string& term,
264  int& pos) const;
265 
266 public:
267  Source();
268 
270  Xapian::Database& db() {
271  open();
272  return m_db;
273  }
274 
276  const Xapian::Database& db() const {
277  open();
278  return m_db;
279  }
280 
281  void open() const;
282  void invalidate() {
283  m_db = Xapian::Database();
284  m_opened = false;
285  }
286 
288  time_t timestamp() const;
289 
290  void updateLeniently( AptDatabase &apt, OpProgress *op = 0 ) {
291  if (apt.timestamp() - timestamp() > 86400 * 8) // a little over a week
292  update( op );
293  }
294 
295  void update( OpProgress *op = 0 ) {
296  if ( !op )
297  op = new OpProgress();
298 
299  wibble::exception::AddContext _ctx( "Rebuilding Xapian database." );
300  int outfd;
301  std::string op_str;
302 
303  wibble::sys::Exec ex( "update-apt-xapian-index" );
304  ex.args.push_back( "--batch-mode" );
305  ex.searchInPath = true;
306  ex.forkAndRedirect( 0, &outfd, 0 );
307 
308  wibble::sys::Pipe monit( outfd );
309  while ( !monit.eof() ) {
310  std::string line = monit.nextLine();
311  if ( line.empty() ) {
312  usleep( 100000 );
313  continue;
314  }
315  std::cerr << "got : " << line << std::endl;
316  if ( wibble::str::startsWith( line, "begin: " ) ) {
317  op_str = std::string( line, 7, std::string::npos );
318  op->OverallProgress( 0, 100, 100, op_str );
319 
320  } else if ( wibble::str::startsWith( line, "done: " ) ) {
321  op->Done();
322  } else if ( wibble::str::startsWith( line, "progress: " ) ) {
323  wibble::ERegexp re( "progress: ([0-9]+)/([0-9]+)", 3 );
324  if ( re.match( line ) ) {
325  assert_eq( re[2], "100" );
326  op->OverallProgress( atoi( re[1].c_str() ), 100, 100, op_str );
327  }
328  }
329  }
330  ex.waitForSuccess();
331  invalidate();
332  }
333 
335  bool hasData() const { return timestamp() > 0; }
336 
337  Query query( const std::string &s,
338  bool expand = true,
339  int qualityCutoff = 50 )
340  {
341  Query q( db() );
342  q.setQualityCutoff( qualityCutoff );
343  q.setExpand( expand );
344  q.addTerms( s );
345  if ( s.length() > 2 )
346  q.addSecondaryTerm( "XP" + s, true );
347  return q;
348  }
349 
350  Query partialQuery( const std::string &s ) {
351  Query q( db() );
352  q.addTerms( s, true ); // partial
353  return q;
354  }
355 
357  // bool needsRebuild(apt::Apt& apt);
358 
359  Xapian::docid docidByName(const std::string& pkgname) const;
360 
364  Xapian::Query makeORQuery(const std::string& keywords) const;
365 
372  Xapian::Query makePartialORQuery(const std::string& keywords) const;
373 
377  template<typename ITER>
378  Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const
379  {
380  return Xapian::Query(Xapian::Query::OP_OR, begin, end);
381  }
382 
384  std::vector<std::string> expand(Xapian::Enquire& enq) const;
385 
386 // std::vector<std::string> similar(const std::string& pkg);
387 
391  Xapian::Query makeRelatedQuery(const std::string& pkgname) const;
392 
396  double getDoubleValue(const std::string& pkgname,
397  Xapian::valueno val_id) const;
398 
402  int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const;
403 };
404 
405 }
406 }
407 }
408 
409 #endif