libept
xapian.h
Go to the documentation of this file.
00001 // -*- C++ -*-
00002 #include <xapian.h>
00003 #include <ept/core/apt.h>
00004 #include <wibble/regexp.h>
00005 #include <wibble/sys/pipe.h>
00006 #include <wibble/sys/exec.h>
00007 
00008 #ifndef EPT_XAPIAN_H
00009 #define EPT_XAPIAN_H
00010 
00011 namespace ept {
00012 namespace core {
00013 namespace xapian {
00014 
00015 // Allocate value indexes for known values
00016 const Xapian::valueno VAL_APT_INSTALLED_SIZE      =  1;
00017 const Xapian::valueno VAL_APT_PACKAGE_SIZE        =  2;
00018 const Xapian::valueno VAL_POPCON                  = 10;
00019 const Xapian::valueno VAL_ITERATING_RATING        = 20;
00020 const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21;
00021 const Xapian::valueno VAL_ITERATING_USABILITY     = 22;
00022 const Xapian::valueno VAL_ITERATING_SECURITY      = 23;
00023 const Xapian::valueno VAL_ITERATING_PERFORMANCE   = 24;
00024 const Xapian::valueno VAL_ITERATING_QUALITY       = 25;
00025 const Xapian::valueno VAL_ITERATING_SUPPORT       = 26;
00026 const Xapian::valueno VAL_ITERATING_ADOPTION      = 27;
00027 
00028 struct TagFilter : public Xapian::ExpandDecider
00029 {
00030     virtual bool operator()(const std::string &term) const {
00031         return term[0] == 'X' && term[1] == 'T';
00032     }
00033 };
00034 
00035 struct List {
00036     char m_enqPlace[sizeof(Xapian::Enquire)];
00037     mutable Xapian::MSet m_matches;
00038     mutable Xapian::MSet::const_iterator m_iter;
00039     mutable int m_pos;
00040     typedef List Type;
00041 
00042     static const size_t chunkSize = 20;
00043 
00044     List head() const {
00045         seek();
00046         return *this;
00047     }
00048 
00049     Token token() const {
00050         Token t;
00051         t._id = m_iter.get_document().get_data();
00052         return t;
00053     }
00054 
00055     bool operator<( const List &o ) const {
00056         return token() < o.token();
00057     }
00058 
00059     void seek() const {
00060         if ( m_matches.size() == chunkSize && m_iter == m_matches.end() ) {
00061             m_matches = enq().get_mset( m_pos, chunkSize );
00062             m_iter = m_matches.begin();
00063             m_pos += chunkSize;
00064         }
00065     }
00066 
00067     bool empty() const {
00068         if ( m_pos == -1 )
00069             return true;
00070         seek();
00071         return m_matches.size() < 30 && m_iter == m_matches.end();
00072     }
00073 
00074     List tail() const {
00075         List t = *this;
00076         t.seek();
00077         t.m_iter ++;
00078         return t;
00079     }
00080 
00081     Xapian::Enquire const &enq() const {
00082         return *reinterpret_cast< Xapian::Enquire const * >( m_enqPlace );
00083     }
00084 
00085     List( Xapian::Enquire _enq )
00086     {
00087         Xapian::Enquire *e = new (m_enqPlace) Xapian::Enquire( _enq );
00088         assert_eq( e, &enq() );
00089         m_matches = enq().get_mset( 0, chunkSize );
00090         m_iter = m_matches.begin();
00091         m_pos = chunkSize;
00092     }
00093 
00094     List() {}
00095 };
00096 
00097 struct Query {
00098     Xapian::Database *m_db;
00099     Xapian::Enquire m_enq;
00100     Xapian::Stem m_stem;
00101     typedef std::set< std::string > Terms;
00102     Terms m_include, m_exclude, m_secondary;
00103     int m_cutoff;
00104     bool m_expand;
00105 
00106     void setQualityCutoff( int c ) {
00107         m_cutoff = c;
00108     }
00109 
00110     void setExpand( bool e ) { m_expand = e; }
00111 
00112     Query( Xapian::Database &e ) : m_db( &e ), m_enq( e ) {
00113         m_cutoff = 50;
00114         m_expand = true;
00115     }
00116 
00117     wibble::Tokenizer queryTokenizer( std::string q ) const {
00118         return wibble::Tokenizer( q, "[A-Za-z0-9._+:-]+", REG_EXTENDED );
00119     }
00120 
00121     template< typename Out >
00122     void tokenizeQuery( std::string q, Out o ) const
00123     {
00124         wibble::Tokenizer tok = queryTokenizer( q );
00125         for (wibble::Tokenizer::const_iterator i = tok.begin(); i != tok.end(); ++i )
00126         {
00127             if ( (*i).find( "::" ) != std::string::npos ) { // assume tag
00128                 *o++ = ("XT" + *i);
00129             } else {
00130                 std::string t = wibble::str::tolower(*i);
00131                 std::string s = m_stem(t);
00132                 *o++ = t;
00133                 if (s != t)
00134                     *o++ = ("Z" + s);
00135             }
00136         }
00137     }
00138 
00139     template< typename Out >
00140     void expand( Out o ) const
00141     {
00142         Xapian::RSet rset;
00143         // Get the top 5 results as 'good ones' to compute the search expansion
00144         Xapian::MSet mset = m_enq.get_mset(0, 5);
00145         for ( Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i )
00146             rset.add_document(i);
00147         // Get the expanded set, only expanding the query with tag names
00148         TagFilter tagf;
00149         Xapian::ESet eset = m_enq.get_eset(5, rset, &tagf);
00150         for ( Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i )
00151             *o++ = *i;
00152     }
00153 
00154     void updateEnquire() {
00155         // set up query now
00156         Xapian::Query inc( Xapian::Query::OP_OR,
00157                            m_include.begin(),
00158                            m_include.end() ),
00159             exc( Xapian::Query::OP_OR,
00160                  m_exclude.begin(),
00161                  m_exclude.end() ),
00162             secondary( Xapian::Query::OP_OR,
00163                        m_secondary.begin(),
00164                        m_secondary.end() ),
00165             secondary1( Xapian::Query::OP_SCALE_WEIGHT, secondary, 0.02 ),
00166             query1( Xapian::Query::OP_AND_NOT, inc, exc ),
00167             query( Xapian::Query::OP_OR, query1, secondary1 );
00168 
00169         m_enq.set_query( query );
00170 
00171         if ( m_expand ) {
00172             m_expand = false;
00173             expand( std::inserter( m_include, m_include.begin() ) );
00174             updateEnquire();
00175             m_expand = true;
00176             return;
00177         }
00178 
00179         Xapian::MSet first = m_enq.get_mset(0, 1, 0, 0, 0);
00180         Xapian::MSetIterator ifirst = first.begin();
00181         if ( ifirst != first.end() ) {
00182             // Xapian::percent cutoff = ifirst.get_percent() * m_cutoff / 100;
00183             // m_enq.set_cutoff(cutoff);
00184         }
00185     }
00186 
00187     List results() {
00188         updateEnquire();
00189         return List( m_enq );
00190     }
00191 
00192     std::map< std::string, int > relevantTags( int n = 30 ) {
00193         updateEnquire();
00194         std::map< std::string, int > relev;
00195         Xapian::RSet rset;
00196         Xapian::MSet mset = m_enq.get_mset(0, 100);
00197         for ( Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i )
00198             rset.add_document(i);
00199         // Get the expanded set, only expanding the query with tag names
00200         TagFilter tagf;
00201         Xapian::ESet eset = m_enq.get_eset(n, rset, &tagf);
00202         for ( Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i )
00203             relev.insert( relev.begin(),
00204                           std::make_pair(
00205                               std::string( *i, 2, std::string::npos ),
00206                               i.get_weight() ) );
00207         return relev;
00208     }
00209 
00210     void addTerms( std::string t, bool partial = false, bool exclude = false ) {
00211         if ( t.empty() )
00212             return;
00213         Terms &to = exclude ? m_exclude : m_include;
00214         std::vector< std::string > tok;
00215         tokenizeQuery( t, std::back_inserter( tok ) );
00216         if ( partial ) {
00217             if ( tok.back().size() == 1 ) {
00218                 tok.pop_back();
00219             } else {
00220                 std::copy(
00221                     m_db->allterms_begin( tok.back() ),
00222                     m_db->allterms_end( tok.back() ),
00223                     std::back_inserter( tok ) );
00224             }
00225         }
00226         std::copy( tok.begin(), tok.end(), std::inserter( to, to.begin() ) );
00227     }
00228 
00229     void addTerms( const Terms &t, bool exclude = false ) {
00230         Terms &to = exclude ? m_exclude : m_include;
00231         std::copy( t.begin(), t.end(), std::inserter( to, to.begin() ) );
00232     }
00233 
00234     void addSecondaryTerm( const std::string &term, bool partial = false ) {
00235         if ( partial ) {
00236             std::copy(
00237                 m_db->allterms_begin( term ),
00238                 m_db->allterms_end( term ),
00239                 std::inserter( m_secondary, m_secondary.begin() ) );
00240         } else {
00241             m_include.insert( m_secondary.begin(), term );
00242         }
00243     }
00244 
00245 };
00246 
00247 struct Source
00248 {
00249 protected:
00250     mutable Xapian::Database m_db;
00251     Xapian::Stem m_stem;
00252     mutable bool m_opened;
00253 
00255     static std::string toLower(const std::string& str);
00256 
00263     void normalize_and_add(Xapian::Document& doc, const std::string& term,
00264                            int& pos) const;
00265 
00266 public:
00267     Source();
00268 
00270     Xapian::Database& db() {
00271         open();
00272         return m_db;
00273     }
00274 
00276     const Xapian::Database& db() const {
00277         open();
00278         return m_db;
00279     }
00280 
00281     void open() const;
00282     void invalidate() {
00283         m_db = Xapian::Database();
00284         m_opened = false;
00285     }
00286 
00288     time_t timestamp() const;
00289 
00290     void updateLeniently( AptDatabase &apt, OpProgress *op = 0 ) {
00291         if (apt.timestamp() - timestamp() > 86400 * 8) // a little over a week
00292             update( op );
00293     }
00294 
00295     void update( OpProgress *op = 0 ) {
00296         if ( !op )
00297             op = new OpProgress();
00298 
00299         wibble::exception::AddContext _ctx( "Rebuilding Xapian database." );
00300         int outfd;
00301         std::string op_str;
00302 
00303         wibble::sys::Exec ex( "update-apt-xapian-index" );
00304         ex.args.push_back( "--batch-mode" );
00305         ex.searchInPath = true;
00306         ex.forkAndRedirect( 0, &outfd, 0 );
00307 
00308         wibble::sys::Pipe monit( outfd );
00309         while ( !monit.eof() ) {
00310             std::string line = monit.nextLine();
00311             if ( line.empty() ) {
00312                 usleep( 100000 );
00313                 continue;
00314             }
00315             std::cerr << "got : " << line << std::endl;
00316             if ( wibble::str::startsWith( line, "begin: " ) ) {
00317                 op_str = std::string( line, 7, std::string::npos );
00318                 op->OverallProgress( 0, 100, 100, op_str );
00319                     
00320             } else if ( wibble::str::startsWith( line, "done: " ) ) {
00321                 op->Done();
00322             } else if ( wibble::str::startsWith( line, "progress: " ) ) {
00323                 wibble::ERegexp re( "progress: ([0-9]+)/([0-9]+)", 3 );
00324                 if ( re.match( line ) ) {
00325                     assert_eq( re[2], "100" );
00326                     op->OverallProgress( atoi( re[1].c_str() ), 100, 100, op_str );
00327                 }
00328             }
00329         }
00330         ex.waitForSuccess();
00331         invalidate();
00332     }
00333 
00335     bool hasData() const { return timestamp() > 0; }
00336 
00337     Query query( const std::string &s,
00338                  bool expand = true,
00339                  int qualityCutoff = 50 )
00340     {
00341         Query q( db() );
00342         q.setQualityCutoff( qualityCutoff );
00343         q.setExpand( expand );
00344         q.addTerms( s );
00345         if ( s.length() > 2 )
00346             q.addSecondaryTerm( "XP" + s, true );
00347         return q;
00348     }
00349 
00350     Query partialQuery( const std::string &s ) {
00351         Query q( db() );
00352         q.addTerms( s, true ); // partial
00353         return q;
00354     }
00355 
00357     // bool needsRebuild(apt::Apt& apt);
00358 
00359     Xapian::docid docidByName(const std::string& pkgname) const;
00360 
00364     Xapian::Query makeORQuery(const std::string& keywords) const;
00365 
00372     Xapian::Query makePartialORQuery(const std::string& keywords) const;
00373 
00377     template<typename ITER>
00378     Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const
00379     {
00380         return Xapian::Query(Xapian::Query::OP_OR, begin, end);
00381     }
00382 
00384     std::vector<std::string> expand(Xapian::Enquire& enq) const;
00385 
00386 //  std::vector<std::string> similar(const std::string& pkg);
00387 
00391     Xapian::Query makeRelatedQuery(const std::string& pkgname) const;
00392 
00396     double getDoubleValue(const std::string& pkgname,
00397                           Xapian::valueno val_id) const;
00398 
00402     int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const;
00403 };
00404 
00405 }
00406 }
00407 }
00408 
00409 #endif