catalog.py

   1 # -*- coding: utf-8 -*-
   2 from App.class_init import InitializeClass
   3 from AccessControl import ClassSecurityInfo
   4 from Products.CMFCore.interfaces import IIndexableObject
   5 from Products.CMFCore.CatalogTool import CatalogTool as BaseCatalogTool
   6 from Products.CMFCore.CatalogTool import IndexableObjectWrapper
   7 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
   8 from Products.CMFCore.permissions import ModifyPortalContent, ManagePortal
   9 from zope.component import queryMultiAdapter
  10 from Products.ZCatalog.Catalog import Catalog
  11 import transaction
  12 from solr import *
  13
  14 # imports for Catalog class
  15 from Products.PluginIndexes.interfaces import ILimitedResultIndex
  16 from Products.ZCatalog.Lazy import LazyMap, LazyCat, LazyValues
  17 from BTrees.IIBTree import intersection, IISet
  18 from BTrees.IIBTree import weightedIntersection
  19 import warnings
  20
  21 _VOLATILE_SOLR_NAME = '_v_solrConnection'
  22
  23 class SolrTransactionHook :
  24     ''' commit solr couplé sur le commit de la ZODB '''
  25     def __init__(self, context, con) :
  26         self.context = context
  27         self.con = con
  28
  29     def __call__(self, status) :
  30         if status :
  31             self.con.commit()
  32             self.con.close()
  33         else :
  34             self.con.close()
  35         try :
  36             delattr(self.context, _VOLATILE_SOLR_NAME)
  37         except AttributeError :
  38             pass
  39
  40 class CatalogTool(BaseCatalogTool) :
  41     meta_type = 'Plinn Catalog'
  42     security = ClassSecurityInfo()
  43     manage_options = (BaseCatalogTool.manage_options[:5] +
  44                       ({'label' : 'Solr', 'action' : 'manage_solr'},) +
  45                       BaseCatalogTool.manage_options[5:])
  46     manage_solr = PageTemplateFile('www/manage_solr.pt', globals(), __name__='manage_solr')
  47
  48
  49
  50     def __init__(self, idxs=[]) :
  51         super(CatalogTool, self).__init__()
  52         self._catalog = DelegatedCatalog(self)
  53         self.solr_url = 'http://localhost:8983/solr'
  54         self.delegatedIndexes = ('Title', 'Description', 'SearchableText')
  55
  56     security.declarePublic('getDelegatedIndexes')
  57     def getDelegatedIndexes(self) :
  58         """ read the method name """
  59         return self.delegatedIndexes
  60
  61     security.declareProtected(ManagePortal, 'setDelegatedIndexes')
  62     def setDelegatedIndexes(self, indexes, REQUEST=None) :
  63         """setDelegatedIndexes documentation"""
  64         self.delegatedIndexes = tuple([i.strip() for i in indexes if i.strip()])
  65         if REQUEST :
  66             REQUEST.RESPONSE.redirect(self.absolute_url() + '/manage_solr?manage_tabs_message=Saved changes.')
  67
  68     def _getSolrConnection(self) :
  69         if not hasattr(self, _VOLATILE_SOLR_NAME) :
  70             con = SolrConnection(self.solr_url)
  71             setattr(self, _VOLATILE_SOLR_NAME, con)
  72             txn = transaction.get()
  73             txn.addAfterCommitHook(SolrTransactionHook(self, con))
  74         return getattr(self, _VOLATILE_SOLR_NAME)
  75
  76     security.declarePrivate('solrAdd')
  77     def solrAdd(self, object, idxs=[], uid=None) :
  78         if IIndexableObject.providedBy(object):
  79             w = object
  80         else:
  81             w = queryMultiAdapter( (object, self), IIndexableObject )
  82             if w is None:
  83                 # BBB
  84                 w = IndexableObjectWrapper(object, self)
  85
  86         uid = uid if uid else self.__url(object)
  87         idxs = idxs if idxs !=[] else self.delegatedIndexes
  88         data = {'id' : uid}
  89         for name in idxs :
  90             attr = getattr(w, name, '')
  91             data[name] = attr() if callable(attr) else attr
  92         c = self._getSolrConnection()
  93         c.add(**data)
  94
  95
  96     # PortalCatalog api overloads
  97     security.declareProtected(ModifyPortalContent, 'indexObject')
  98     def indexObject(self, object) :
  99         """ Add to catalog and send to Solr """
 100         super(CatalogTool, self).indexObject(object)
 101         self.solrAdd(object)
 102
 103     security.declarePrivate('reindexObject')
 104     def reindexObject(self, object, idxs=[], update_metadata=1, uid=None):
 105         super(CatalogTool, self).reindexObject(object,
 106                                                idxs=idxs,
 107                                                update_metadata=update_metadata,
 108                                                uid=uid)
 109         if idxs != []:
 110             # Filter out invalid indexes.
 111             valid_indexes = self._catalog.indexes.keys()
 112             idxs = [i for i in idxs if i in valid_indexes and i in self.delegatedIndexes]
 113         else :
 114             idxs = self.delegatedIndexes
 115
 116         if idxs :
 117             self.solrAdd(object, idxs=idxs, uid=uid)
 118
 119     security.declarePrivate('unindexObject')
 120     def unindexObject(self, object):
 121         """Remove from catalog.
 122         """
 123         super(CatalogTool, self).unindexObject(object)
 124         c = self._getSolrConnection()
 125         url = self.__url(object)
 126         c.delete(id=url)
 127
 128 InitializeClass(CatalogTool)
 129
 130
 131 class DelegatedCatalog(Catalog) :
 132     '''C'est ici qu'on délègue effectivement à Solr '''
 133
 134     def __init__(self, zcat, brains=None) :
 135         Catalog.__init__(self, brains=brains)
 136         self.zcat = zcat
 137
 138     def delegateSearch(self, query, plan) :
 139         '''
 140         retours faux :
 141         None signifie : pas de délégation, il faut continuer à interroger les autres index.
 142         IISet() vide : pas de résultat lors de la délégation, on peut arrêter la recherche.
 143         '''
 144         indexes = set(query.keys()).intersection(set(self.zcat.delegatedIndexes))
 145         if not indexes :
 146             return None
 147         delegatedQuery = {}
 148         for i in indexes :
 149             delegatedQuery[i] = query.pop(i)
 150             try : plan.remove(i)
 151             except ValueError : pass
 152         c = SolrConnection(self.zcat.solr_url)
 153         q =' AND '.join(['%s:"%s"' % item for item in delegatedQuery.items()])
 154         resp = c.query(q, fields='id', rows=len(self))
 155         c.close()
 156         return IISet(filter(None, [self.uids.get(r['id']) for r in resp.results]))
 157
 158     def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
 159         """Iterate through the indexes, applying the query to each one. If
 160         merge is true then return a lazy result set (sorted if appropriate)
 161         otherwise return the raw (possibly scored) results for later merging.
 162         Limit is used in conjuntion with sorting or scored results to inform
 163         the catalog how many results you are really interested in. The catalog
 164         can then use optimizations to save time and memory. The number of
 165         results is not guaranteed to fall within the limit however, you should
 166         still slice or batch the results as usual."""
 167
 168         rs = None # resultset
 169
 170         # Indexes fulfill a fairly large contract here. We hand each
 171         # index the query mapping we are given (which may be composed
 172         # of some combination of web request, kw mappings or plain old dicts)
 173         # and the index decides what to do with it. If the index finds work
 174         # for itself in the query, it returns the results and a tuple of
 175         # the attributes that were used. If the index finds nothing for it
 176         # to do then it returns None.
 177
 178         # Canonicalize the request into a sensible query before passing it on
 179         query = self.make_query(query)
 180
 181         cr = self.getCatalogPlan(query)
 182         cr.start()
 183
 184         plan = cr.plan()
 185         if not plan:
 186             plan = self._sorted_search_indexes(query)
 187
 188         # délégation
 189         rs = self.delegateSearch(query, plan)
 190         if rs is not None and not rs :
 191             return LazyCat([])
 192
 193         indexes = self.indexes.keys()
 194         for i in plan:
 195             if i not in indexes:
 196                 # We can have bogus keys or the plan can contain index names
 197                 # that have been removed in the meantime
 198                 continue
 199
 200             index = self.getIndex(i)
 201             _apply_index = getattr(index, "_apply_index", None)
 202             if _apply_index is None:
 203                 continue
 204
 205             cr.start_split(i)
 206             limit_result = ILimitedResultIndex.providedBy(index)
 207             if limit_result:
 208                 r = _apply_index(query, rs)
 209             else:
 210                 r = _apply_index(query)
 211
 212             if r is not None:
 213                 r, u = r
 214                 # Short circuit if empty result
 215                 # BBB: We can remove the "r is not None" check in Zope 2.14
 216                 # once we don't need to support the "return everything" case
 217                 # anymore
 218                 if r is not None and not r:
 219                     cr.stop_split(i, result=None, limit=limit_result)
 220                     return LazyCat([])
 221
 222                 # provide detailed info about the pure intersection time
 223                 intersect_id = i + '#intersection'
 224                 cr.start_split(intersect_id)
 225                 # weightedIntersection preserves the values from any mappings
 226                 # we get, as some indexes don't return simple sets
 227                 if hasattr(rs, 'items') or hasattr(r, 'items'):
 228                     _, rs = weightedIntersection(rs, r)
 229                 else:
 230                     rs = intersection(rs, r)
 231
 232                 cr.stop_split(intersect_id)
 233
 234                 # consider the time it takes to intersect the index result with
 235                 # the total resultset to be part of the index time
 236                 cr.stop_split(i, result=r, limit=limit_result)
 237                 if not rs:
 238                     break
 239             else:
 240                 cr.stop_split(i, result=None, limit=limit_result)
 241
 242         # Try to deduce the sort limit from batching arguments
 243         b_start = int(query.get('b_start', 0))
 244         b_size = query.get('b_size', None)
 245         if b_size is not None:
 246             b_size = int(b_size)
 247
 248         if b_size is not None:
 249             limit = b_start + b_size
 250         elif limit and b_size is None:
 251             b_size = limit
 252
 253         if rs is None:
 254             # None of the indexes found anything to do with the query
 255             # We take this to mean that the query was empty (an empty filter)
 256             # and so we return everything in the catalog
 257             warnings.warn('Your query %s produced no query restriction. '
 258                           'Currently the entire catalog content is returned. '
 259                           'In Zope 2.14 this will result in an empty LazyCat '
 260                           'to be returned.' % repr(cr.make_key(query)),
 261                           DeprecationWarning, stacklevel=3)
 262
 263             rlen = len(self)
 264             if sort_index is None:
 265                 sequence, slen = self._limit_sequence(self.data.items(), rlen,
 266                     b_start, b_size)
 267                 result = LazyMap(self.instantiate, sequence, slen,
 268                     actual_result_count=rlen)
 269             else:
 270                 cr.start_split('sort_on')
 271                 result = self.sortResults(
 272                     self.data, sort_index, reverse, limit, merge,
 273                         actual_result_count=rlen, b_start=b_start,
 274                         b_size=b_size)
 275                 cr.stop_split('sort_on', None)
 276         elif rs:
 277             # We got some results from the indexes.
 278             # Sort and convert to sequences.
 279             # XXX: The check for 'values' is really stupid since we call
 280             # items() and *not* values()
 281             rlen = len(rs)
 282             if sort_index is None and hasattr(rs, 'items'):
 283                 # having a 'items' means we have a data structure with
 284                 # scores.  Build a new result set, sort it by score, reverse
 285                 # it, compute the normalized score, and Lazify it.
 286
 287                 if not merge:
 288                     # Don't bother to sort here, return a list of
 289                     # three tuples to be passed later to mergeResults
 290                     # note that data_record_normalized_score_ cannot be
 291                     # calculated and will always be 1 in this case
 292                     getitem = self.__getitem__
 293                     result = [(score, (1, score, rid), getitem)
 294                             for rid, score in rs.items()]
 295                 else:
 296                     cr.start_split('sort_on')
 297
 298                     rs = rs.byValue(0) # sort it by score
 299                     max = float(rs[0][0])
 300
 301                     # Here we define our getter function inline so that
 302                     # we can conveniently store the max value as a default arg
 303                     # and make the normalized score computation lazy
 304                     def getScoredResult(item, max=max, self=self):
 305                         """
 306                         Returns instances of self._v_brains, or whatever is
 307                         passed into self.useBrains.
 308                         """
 309                         score, key = item
 310                         r=self._v_result_class(self.data[key])\
 311                               .__of__(aq_parent(self))
 312                         r.data_record_id_ = key
 313                         r.data_record_score_ = score
 314                         r.data_record_normalized_score_ = int(100. * score / max)
 315                         return r
 316
 317                     sequence, slen = self._limit_sequence(rs, rlen, b_start,
 318                         b_size)
 319                     result = LazyMap(getScoredResult, sequence, slen,
 320                         actual_result_count=rlen)
 321                     cr.stop_split('sort_on', None)
 322
 323             elif sort_index is None and not hasattr(rs, 'values'):
 324                 # no scores
 325                 if hasattr(rs, 'keys'):
 326                     rs = rs.keys()
 327                 sequence, slen = self._limit_sequence(rs, rlen, b_start,
 328                     b_size)
 329                 result = LazyMap(self.__getitem__, sequence, slen,
 330                     actual_result_count=rlen)
 331             else:
 332                 # sort.  If there are scores, then this block is not
 333                 # reached, therefore 'sort-on' does not happen in the
 334                 # context of a text index query.  This should probably
 335                 # sort by relevance first, then the 'sort-on' attribute.
 336                 cr.start_split('sort_on')
 337                 result = self.sortResults(rs, sort_index, reverse, limit,
 338                     merge, actual_result_count=rlen, b_start=b_start,
 339                     b_size=b_size)
 340                 cr.stop_split('sort_on', None)
 341         else:
 342             # Empty result set
 343             result = LazyCat([])
 344         cr.stop()
 345         return result