catalog.py

   1 # -*- coding: utf-8 -*-
   2 from App.class_init import InitializeClass
   3 from AccessControl import ClassSecurityInfo
   4 from Products.CMFCore.interfaces import IIndexableObject
   5 from Products.CMFCore.CatalogTool import CatalogTool as BaseCatalogTool
   6 from Products.CMFCore.CatalogTool import IndexableObjectWrapper
   7 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   8 from Products.CMFCore.permissions import ModifyPortalContent
   9 from zope.component import queryMultiAdapter
  10 from Products.ZCatalog.Catalog import Catalog
  11 import transaction
  12 from solr import *
  13
  14 # imports for Catalog class
  15 from Products.PluginIndexes.interfaces import ILimitedResultIndex
  16 from Products.ZCatalog.Lazy import LazyMap, LazyCat, LazyValues
  17 from BTrees.IIBTree import intersection, IISet
  18 from BTrees.IIBTree import weightedIntersection
  19 import warnings
  20
  21 class SolrTransactionHook :
  22     ''' commit solr couplé sur le commit de la ZODB '''
  23     def __init__(self, connection) :
  24         self.connection = connection
  25
  26     def __call__(self, status) :
  27         if status :
  28             self.connection.commit()
  29             self.connection.close()
  30         else :
  31             self.connection.close()
  32
  33 class CatalogTool(BaseCatalogTool) :
  34     meta_type = 'Legivoc Catalog'
  35     security = ClassSecurityInfo()
  36     manage_options = (BaseCatalogTool.manage_options[:5] +
  37                       ({'label' : 'Solr', 'action' : 'manage_solr'},) +
  38                       BaseCatalogTool.manage_options[5:])
  39     manage_solr = PageTemplateFile('www/manage_solr', globals())
  40
  41
  42     def __init__(self, idxs=[]) :
  43         super(CatalogTool, self).__init__()
  44         self._catalog = DelegatedCatalog()
  45         self.solr_url = 'http://localhost:8983/solr'
  46         self.delegatedIndexes = ('Title', 'Description', 'SearchableText')
  47
  48     security.declarePrivate('solrAdd')
  49     def solrAdd(self, object, idxs=[], uid=None) :
  50         if IIndexableObject.providedBy(object):
  51             w = object
  52         else:
  53             w = queryMultiAdapter( (object, self), IIndexableObject )
  54             if w is None:
  55                 # BBB
  56                 w = IndexableObjectWrapper(object, self)
  57
  58         uid = uid if uid else self.__url(object)
  59         idxs = idxs if idxs !=[] else self.delegatedIndexes
  60         data = {'id' : uid}
  61         for name in idxs :
  62             attr = getattr(w, name, '')
  63             data[name] = attr() if callable(attr) else attr
  64         c = SolrConnection(self.solr_url)
  65         c.add(**data)
  66         txn = transaction.get()
  67         txn.addAfterCommitHook(SolrTransactionHook(c))
  68
  69
  70     # PortalCatalog api overloads
  71     security.declareProtected(ModifyPortalContent, 'indexObject')
  72     def indexObject(self, object) :
  73         """ Add to catalog and send to Solr """
  74         super(CatalogTool, self).indexObject(object)
  75         self.solrAdd(object)
  76
  77     security.declarePrivate('reindexObject')
  78     def reindexObject(self, object, idxs=[], update_metadata=1, uid=None):
  79         super(CatalogTool, self).reindexObject(object,
  80                                                idxs=idxs,
  81                                                update_metadata=update_metadata,
  82                                                uid=uid)
  83         if idxs != []:
  84             # Filter out invalid indexes.
  85             valid_indexes = self._catalog.indexes.keys()
  86             idxs = [i for i in idxs if i in valid_indexes and i in self.delegatedIndexes]
  87         else :
  88             idxs = self.delegatedIndexes
  89
  90         if idxs :
  91             self.solrAdd(object, idxs=idxs, uid=uid)
  92
  93     security.declarePrivate('unindexObject')
  94     def unindexObject(self, object):
  95         """Remove from catalog.
  96         """
  97         super(CatalogTool, self).unindexObject(object)
  98         c = SolrConnection(self.solr_url)
  99         url = self.__url(object)
 100         c.delete(id=url)
 101         txn = transaction.get()
 102         txn.addAfterCommitHook(SolrTransactionHook(c))
 103
 104 InitializeClass(CatalogTool)
 105
 106
 107 class DelegatedCatalog(Catalog) :
 108     '''C'est ici qu'on délègue effectivement à Solr '''
 109
 110     def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
 111         """Iterate through the indexes, applying the query to each one. If
 112         merge is true then return a lazy result set (sorted if appropriate)
 113         otherwise return the raw (possibly scored) results for later merging.
 114         Limit is used in conjuntion with sorting or scored results to inform
 115         the catalog how many results you are really interested in. The catalog
 116         can then use optimizations to save time and memory. The number of
 117         results is not guaranteed to fall within the limit however, you should
 118         still slice or batch the results as usual."""
 119
 120         rs = None # resultset
 121
 122         # Indexes fulfill a fairly large contract here. We hand each
 123         # index the query mapping we are given (which may be composed
 124         # of some combination of web request, kw mappings or plain old dicts)
 125         # and the index decides what to do with it. If the index finds work
 126         # for itself in the query, it returns the results and a tuple of
 127         # the attributes that were used. If the index finds nothing for it
 128         # to do then it returns None.
 129
 130         # Canonicalize the request into a sensible query before passing it on
 131         query = self.make_query(query)
 132
 133         cr = self.getCatalogPlan(query)
 134         cr.start()
 135
 136         plan = cr.plan()
 137         if not plan:
 138             plan = self._sorted_search_indexes(query)
 139
 140         indexes = self.indexes.keys()
 141         for i in plan:
 142             if i not in indexes:
 143                 # We can have bogus keys or the plan can contain index names
 144                 # that have been removed in the meantime
 145                 continue
 146
 147             index = self.getIndex(i)
 148             _apply_index = getattr(index, "_apply_index", None)
 149             if _apply_index is None:
 150                 continue
 151
 152             cr.start_split(i)
 153             limit_result = ILimitedResultIndex.providedBy(index)
 154             if limit_result:
 155                 r = _apply_index(query, rs)
 156             else:
 157                 r = _apply_index(query)
 158
 159             if r is not None:
 160                 r, u = r
 161                 # Short circuit if empty result
 162                 # BBB: We can remove the "r is not None" check in Zope 2.14
 163                 # once we don't need to support the "return everything" case
 164                 # anymore
 165                 if r is not None and not r:
 166                     cr.stop_split(i, result=None, limit=limit_result)
 167                     return LazyCat([])
 168
 169                 # provide detailed info about the pure intersection time
 170                 intersect_id = i + '#intersection'
 171                 cr.start_split(intersect_id)
 172                 # weightedIntersection preserves the values from any mappings
 173                 # we get, as some indexes don't return simple sets
 174                 if hasattr(rs, 'items') or hasattr(r, 'items'):
 175                     _, rs = weightedIntersection(rs, r)
 176                 else:
 177                     rs = intersection(rs, r)
 178
 179                 cr.stop_split(intersect_id)
 180
 181                 # consider the time it takes to intersect the index result with
 182                 # the total resultset to be part of the index time
 183                 cr.stop_split(i, result=r, limit=limit_result)
 184                 if not rs:
 185                     break
 186             else:
 187                 cr.stop_split(i, result=None, limit=limit_result)
 188
 189         # Try to deduce the sort limit from batching arguments
 190         b_start = int(query.get('b_start', 0))
 191         b_size = query.get('b_size', None)
 192         if b_size is not None:
 193             b_size = int(b_size)
 194
 195         if b_size is not None:
 196             limit = b_start + b_size
 197         elif limit and b_size is None:
 198             b_size = limit
 199
 200         if rs is None:
 201             # None of the indexes found anything to do with the query
 202             # We take this to mean that the query was empty (an empty filter)
 203             # and so we return everything in the catalog
 204             warnings.warn('Your query %s produced no query restriction. '
 205                           'Currently the entire catalog content is returned. '
 206                           'In Zope 2.14 this will result in an empty LazyCat '
 207                           'to be returned.' % repr(cr.make_key(query)),
 208                           DeprecationWarning, stacklevel=3)
 209
 210             rlen = len(self)
 211             if sort_index is None:
 212                 sequence, slen = self._limit_sequence(self.data.items(), rlen,
 213                     b_start, b_size)
 214                 result = LazyMap(self.instantiate, sequence, slen,
 215                     actual_result_count=rlen)
 216             else:
 217                 cr.start_split('sort_on')
 218                 result = self.sortResults(
 219                     self.data, sort_index, reverse, limit, merge,
 220                         actual_result_count=rlen, b_start=b_start,
 221                         b_size=b_size)
 222                 cr.stop_split('sort_on', None)
 223         elif rs:
 224             # We got some results from the indexes.
 225             # Sort and convert to sequences.
 226             # XXX: The check for 'values' is really stupid since we call
 227             # items() and *not* values()
 228             rlen = len(rs)
 229             if sort_index is None and hasattr(rs, 'items'):
 230                 # having a 'items' means we have a data structure with
 231                 # scores.  Build a new result set, sort it by score, reverse
 232                 # it, compute the normalized score, and Lazify it.
 233
 234                 if not merge:
 235                     # Don't bother to sort here, return a list of
 236                     # three tuples to be passed later to mergeResults
 237                     # note that data_record_normalized_score_ cannot be
 238                     # calculated and will always be 1 in this case
 239                     getitem = self.__getitem__
 240                     result = [(score, (1, score, rid), getitem)
 241                             for rid, score in rs.items()]
 242                 else:
 243                     cr.start_split('sort_on')
 244
 245                     rs = rs.byValue(0) # sort it by score
 246                     max = float(rs[0][0])
 247
 248                     # Here we define our getter function inline so that
 249                     # we can conveniently store the max value as a default arg
 250                     # and make the normalized score computation lazy
 251                     def getScoredResult(item, max=max, self=self):
 252                         """
 253                         Returns instances of self._v_brains, or whatever is
 254                         passed into self.useBrains.
 255                         """
 256                         score, key = item
 257                         r=self._v_result_class(self.data[key])\
 258                               .__of__(aq_parent(self))
 259                         r.data_record_id_ = key
 260                         r.data_record_score_ = score
 261                         r.data_record_normalized_score_ = int(100. * score / max)
 262                         return r
 263
 264                     sequence, slen = self._limit_sequence(rs, rlen, b_start,
 265                         b_size)
 266                     result = LazyMap(getScoredResult, sequence, slen,
 267                         actual_result_count=rlen)
 268                     cr.stop_split('sort_on', None)
 269
 270             elif sort_index is None and not hasattr(rs, 'values'):
 271                 # no scores
 272                 if hasattr(rs, 'keys'):
 273                     rs = rs.keys()
 274                 sequence, slen = self._limit_sequence(rs, rlen, b_start,
 275                     b_size)
 276                 result = LazyMap(self.__getitem__, sequence, slen,
 277                     actual_result_count=rlen)
 278             else:
 279                 # sort.  If there are scores, then this block is not
 280                 # reached, therefore 'sort-on' does not happen in the
 281                 # context of a text index query.  This should probably
 282                 # sort by relevance first, then the 'sort-on' attribute.
 283                 cr.start_split('sort_on')
 284                 result = self.sortResults(rs, sort_index, reverse, limit,
 285                     merge, actual_result_count=rlen, b_start=b_start,
 286                     b_size=b_size)
 287                 cr.stop_split('sort_on', None)
 288         else:
 289             # Empty result set
 290             result = LazyCat([])
 291         cr.stop()
 292         return result