# -*- coding: utf-8 -*-
from App.class_init import InitializeClass
from AccessControl import ClassSecurityInfo
from Products.CMFCore.interfaces import IIndexableObject
from Products.CMFCore.CatalogTool import CatalogTool as BaseCatalogTool
from Products.CMFCore.CatalogTool import IndexableObjectWrapper
from Products.PageTemplates.PageTemplateFile import PageTemplateFile
from Products.CMFCore.permissions import ModifyPortalContent, ManagePortal
from zope.component import queryMultiAdapter
from Products.ZCatalog.Catalog import Catalog
from Acquisition import aq_parent
import transaction
from solr import SolrConnection

# imports for Catalog class
from Products.PluginIndexes.interfaces import ILimitedResultIndex
from Products.ZCatalog.Lazy import LazyMap, LazyCat, LazyValues
from BTrees.IIBTree import intersection, IISet
from BTrees.IIBTree import weightedIntersection
import warnings

_VOLATILE_SOLR_NAME = '_v_solrConnection'

class SolrTransactionHook :
    ''' Commits Solr together with the ZODB transaction commit. '''
    def __init__(self, context, con) :
        self.context = context
        self.con = con

    def __call__(self, status) :
        if status :
            self.con.commit()
            self.con.close()
        else :
            self.con.close()
        try :
            delattr(self.context, _VOLATILE_SOLR_NAME)
        except AttributeError :
            pass

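# Usage sketch (editorial illustration, not part of the original module):
# the hook is registered with transaction.get().addAfterCommitHook(), so the
# transaction machinery calls it once the ZODB commit attempt finishes, with
# status=True on success and status=False on failure. Assuming a CatalogTool
# instance named catalog_tool (hypothetical name):
#
#     con = SolrConnection(catalog_tool.solr_url)
#     transaction.get().addAfterCommitHook(SolrTransactionHook(catalog_tool, con))
#     transaction.commit()   # success -> hook(True): con.commit(); con.close()
#                            # failure -> hook(False): con.close() only
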
class CatalogTool(BaseCatalogTool) :
    meta_type = 'Plinn Catalog'
    security = ClassSecurityInfo()
    manage_options = (BaseCatalogTool.manage_options[:5] +
                      ({'label' : 'Solr', 'action' : 'manage_solr'},) +
                      BaseCatalogTool.manage_options[5:])
    manage_solr = PageTemplateFile('www/manage_solr.pt', globals(), __name__='manage_solr')

    def __init__(self, idxs=[]) :
        super(CatalogTool, self).__init__()
        self._catalog = DelegatedCatalog(self)
        self.solr_url = 'http://localhost:8983/solr'
        self.delegatedIndexes = ('Title', 'Description', 'SearchableText')

    security.declarePublic('getDelegatedIndexes')
    def getDelegatedIndexes(self) :
        """ Return the names of the indexes delegated to Solr. """
        return self.delegatedIndexes

    security.declareProtected(ManagePortal, 'setSolrProperties')
    def setSolrProperties(self, url, indexes, REQUEST=None) :
        """ Set the Solr server url and the delegated indexes. """
        self.solr_url = url
        self.delegatedIndexes = tuple([i.strip() for i in indexes if i.strip()])
        if REQUEST :
            REQUEST.RESPONSE.redirect(self.absolute_url() + '/manage_solr?manage_tabs_message=Saved changes.')

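    # Illustrative call (hypothetical values), e.g. what the ZMI "Solr" tab
    # form would submit, with the indexes given as a sequence of names:
    #
    #     catalog_tool.setSolrProperties('http://solr.example.org:8983/solr',
    #                                    ['Title', 'Description', 'SearchableText'])
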
    def _getSolrConnection(self) :
        if not hasattr(self, _VOLATILE_SOLR_NAME) :
            con = SolrConnection(self.solr_url)
            setattr(self, _VOLATILE_SOLR_NAME, con)
            txn = transaction.get()
            txn.addAfterCommitHook(SolrTransactionHook(self, con))
        return getattr(self, _VOLATILE_SOLR_NAME)

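    # Editorial note: the connection is cached on a '_v_' (volatile) attribute,
    # so the ZODB never persists it and it lives at most as long as the current
    # transaction; the SolrTransactionHook registered above closes it and drops
    # the attribute when the transaction ends. A minimal sketch of the intended
    # behaviour (catalog_tool is a hypothetical instance name):
    #
    #     c1 = catalog_tool._getSolrConnection()
    #     c2 = catalog_tool._getSolrConnection()
    #     assert c1 is c2        # reused within the same transaction
    #     transaction.commit()   # hook commits Solr and closes the connection
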
    security.declarePrivate('solrAdd')
    def solrAdd(self, w, uid, idxs) :
        idxs = idxs if idxs else self.delegatedIndexes
        # Keep only the indexes that are delegated to Solr
        idxs = [i for i in idxs if i in self.delegatedIndexes]
        data = {'id' : uid}
        for name in idxs :
            attr = getattr(w, name, '')
            data[name] = attr() if callable(attr) else attr
        c = self._getSolrConnection()
        c.add(**data)

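    # Illustrative sketch (hypothetical values): for an object whose wrapper
    # exposes Title and SearchableText, the document sent to Solr looks like
    #
    #     data = {'id' : '/plone/front-page',
    #             'Title' : 'Front page',
    #             'SearchableText' : 'Front page Welcome ...'}
    #     c.add(**data)
    #
    # i.e. the Solr document id is the catalog uid (the object's path).
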
    # PortalCatalog API overloads
    def catalog_object(self, obj, uid=None, idxs=None, update_metadata=1,
                       pghandler=None):
        # Wraps the object with workflow and accessibility
        # information just before cataloging.
        if IIndexableObject.providedBy(obj):
            w = obj
        else:
            w = queryMultiAdapter( (obj, self), IIndexableObject )
            if w is None:
                # BBB
                w = IndexableObjectWrapper(obj, self)

        idxs_ = idxs
        if idxs:
            # Filter out invalid indexes.
            valid_indexes = self._catalog.indexes.keys()
            idxs_ = [i for i in idxs if i in valid_indexes]

        super(CatalogTool, self).catalog_object(w, uid, idxs_, update_metadata, pghandler)
        self.solrAdd(w, uid, idxs)

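    # Editorial note: idxs is filtered twice on purpose. idxs_ keeps only names
    # known to the ZCatalog for the super() call, while solrAdd() independently
    # keeps only the delegated indexes. For example (hypothetical values):
    #
    #     catalog_tool.catalog_object(doc, '/plone/front-page',
    #                                 idxs=['Title', 'created'])
    #     # the ZCatalog reindexes Title and created (if such indexes exist);
    #     # Solr only receives the Title field.
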
    security.declarePrivate('reindexObject')
    def reindexObject(self, object, idxs=[], update_metadata=1, uid=None):
        """Update catalog after object data has changed.

        The optional idxs argument is a list of specific indexes
        to update (all of them by default).

        The update_metadata flag controls whether the object's
        metadata record is updated as well.

        If a non-None uid is passed, it will be used as the catalog uid
        for the object instead of its physical path.
        """
        if uid is None:
            uid = self.__url(object)

        self.catalog_object(object, uid, idxs, update_metadata)

    security.declarePrivate('unindexObject')
    def unindexObject(self, object):
        """Remove from catalog.
        """
        super(CatalogTool, self).unindexObject(object)
        c = self._getSolrConnection()
        url = self.__url(object)
        c.delete(id=url)

InitializeClass(CatalogTool)


class DelegatedCatalog(Catalog) :
    ''' This is where queries are actually delegated to Solr. '''

    def __init__(self, zcat, brains=None) :
        Catalog.__init__(self, brains=brains)
        self.zcat = zcat

    def delegateSearch(self, query, plan) :
        '''
        Falsy return values:
        None means no delegation took place; the remaining indexes still have to be queried.
        An empty IISet() means the delegated query matched nothing; the search can stop here.
        '''
        indexes = set(query.keys()).intersection(set(self.zcat.delegatedIndexes))
        if not indexes :
            return None
        delegatedQuery = {}
        for i in indexes :
            delegatedQuery[i] = query.pop(i)
            try : plan.remove(i)
            except ValueError : pass
        c = SolrConnection(self.zcat.solr_url)
        q = ' AND '.join(['%s:"%s"' % item for item in delegatedQuery.items()])
        resp = c.query(q, fields='id', rows=len(self))
        c.close()
        return IISet(filter(None, [self.uids.get(r['id']) for r in resp.results]))

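    # Illustrative sketch (hypothetical query): with
    #
    #     query = {'SearchableText' : 'foo', 'portal_type' : 'Document'}
    #
    # only SearchableText is delegated, so the Solr query string becomes
    #
    #     q = 'SearchableText:"foo"'
    #
    # and the matching Solr ids are mapped back to catalog record ids through
    # self.uids, while 'portal_type' stays in the query for the regular
    # ZCatalog indexes handled by search() below.
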
    def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjunction with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual."""

        rs = None # resultset

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        # Delegation: query Solr first for the delegated indexes
        rs = self.delegateSearch(query, plan)
        if rs is not None and not rs :
            return LazyCat([])

        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue

            cr.start_split(i)
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 2.14
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                # weightedIntersection preserves the values from any mappings
                # we get, as some indexes don't return simple sets
                if hasattr(rs, 'items') or hasattr(r, 'items'):
                    _, rs = weightedIntersection(rs, r)
                else:
                    rs = intersection(rs, r)

                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result with
                # the total resultset to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit

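        # Editorial note: with hypothetical batching arguments b_start=20 and
        # b_size=10, the sort limit becomes 20 + 10 = 30, i.e. only the first
        # 30 records are needed to render that batch; when only limit is given,
        # the batch size defaults to that limit.
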
        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 2.14 this will result in an empty LazyCat '
                          'to be returned.' % repr(cr.make_key(query)),
                          DeprecationWarning, stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                                                      b_start, b_size)
                result = LazyMap(self.instantiate, sequence, slen,
                                 actual_result_count=rlen)
            else:
                cr.start_split('sort_on')
                result = self.sortResults(
                    self.data, sort_index, reverse, limit, merge,
                    actual_result_count=rlen, b_start=b_start,
                    b_size=b_size)
                cr.stop_split('sort_on', None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'items'):
                # having a 'items' means we have a data structure with
                # scores. Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                              for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on')

                    rs = rs.byValue(0) # sort it by score
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        r = self._v_result_class(self.data[key]).__of__(aq_parent(self))
                        r.data_record_id_ = key
                        r.data_record_score_ = score
                        r.data_record_normalized_score_ = int(100. * score / max)
                        return r

                    sequence, slen = self._limit_sequence(rs, rlen, b_start,
                                                          b_size)
                    result = LazyMap(getScoredResult, sequence, slen,
                                     actual_result_count=rlen)
                    cr.stop_split('sort_on', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                                                      b_size)
                result = LazyMap(self.__getitem__, sequence, slen,
                                 actual_result_count=rlen)
            else:
                # sort. If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query. This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split('sort_on')
                result = self.sortResults(rs, sort_index, reverse, limit,
                                          merge, actual_result_count=rlen,
                                          b_start=b_start, b_size=b_size)
                cr.stop_split('sort_on', None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result