Solr doit tout retourner (pour l'instant).
[Plinn.git] / catalog.py
1 # -*- coding: utf-8 -*-
2 from App.class_init import InitializeClass
3 from AccessControl import ClassSecurityInfo
4 from Products.CMFCore.interfaces import IIndexableObject
5 from Products.CMFCore.CatalogTool import CatalogTool as BaseCatalogTool
6 from Products.CMFCore.CatalogTool import IndexableObjectWrapper
7 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
8 from Products.CMFCore.permissions import ModifyPortalContent
9 from zope.component import queryMultiAdapter
10 from Products.ZCatalog.Catalog import Catalog
11 import transaction
12 from solr import *
13
14 # imports for Catalog class
15 from Products.PluginIndexes.interfaces import ILimitedResultIndex
16 from Products.ZCatalog.Lazy import LazyMap, LazyCat, LazyValues
17 from BTrees.IIBTree import intersection, IISet
18 from BTrees.IIBTree import weightedIntersection
19 import warnings
20
class SolrTransactionHook :
    """ZODB after-commit hook coupling a Solr commit to the ZODB commit.

    Registered via ``transaction.get().addAfterCommitHook(...)``; the
    transaction machinery calls it with the commit status.  On success the
    pending Solr changes are committed; in every case the Solr connection
    is closed (fix: the original leaked the connection when ``commit()``
    raised, because ``close()`` was only reached on the no-raise path).
    """
    def __init__(self, connection) :
        # solr connection to finalize once the transaction has ended
        self.connection = connection

    def __call__(self, status) :
        """`status` is true when the ZODB transaction committed successfully."""
        try :
            if status :
                self.connection.commit()
        finally :
            # always release the connection, even if commit() raised
            self.connection.close()
32
class CatalogTool(BaseCatalogTool) :
    """Portal catalog that mirrors a subset of its indexes into Solr.

    Indexing operations go through the standard CMF catalog and, for the
    index names listed in ``delegatedIndexes``, are also pushed to the Solr
    server at ``solr_url``.  Solr commits are coupled to the outcome of the
    ZODB transaction through ``SolrTransactionHook``.

    Fix: the mutable default arguments ``idxs=[]`` were replaced by the
    immutable ``idxs=()`` (empty-check adapted accordingly; callers passing
    lists are unaffected).
    """
    meta_type = 'Legivoc Catalog'
    security = ClassSecurityInfo()
    manage_options = (BaseCatalogTool.manage_options[:5] +
                      ({'label' : 'Solr', 'action' : 'manage_solr'},) +
                      BaseCatalogTool.manage_options[5:])
    manage_solr = PageTemplateFile('www/manage_solr', globals())


    def __init__(self, idxs=()) :
        # `idxs` is accepted for signature compatibility but is not used
        # here (the base __init__ is called without it).
        super(CatalogTool, self).__init__()
        self._catalog = DelegatedCatalog(self)
        # NOTE(review): default Solr endpoint; presumably overridden through
        # the ZMI "Solr" management tab -- confirm.
        self.solr_url = 'http://localhost:8983/solr'
        # Index names whose queries/updates are delegated to Solr.
        self.delegatedIndexes = ('Title', 'Description', 'SearchableText')

    security.declarePrivate('solrAdd')
    def solrAdd(self, object, idxs=(), uid=None) :
        """Send the values of `idxs` (default: all delegated indexes) for
        `object` to Solr under key `uid` (default: the object's catalog URL).

        The Solr commit is deferred to an after-commit hook so it only
        happens if the enclosing ZODB transaction commits.
        """
        if IIndexableObject.providedBy(object):
            w = object
        else:
            w = queryMultiAdapter( (object, self), IIndexableObject )
            if w is None:
                # BBB: older CMF without the IIndexableObject adapter
                w = IndexableObjectWrapper(object, self)

        # self.__url mangles to _CatalogTool__url, which resolves to the
        # method inherited from CMFCore's CatalogTool *only because* the
        # base class carries the same class name.  Do not rename this class
        # without revisiting these calls.
        uid = uid if uid else self.__url(object)
        if not idxs :
            idxs = self.delegatedIndexes
        data = {'id' : uid}
        for name in idxs :
            attr = getattr(w, name, '')
            data[name] = attr() if callable(attr) else attr
        c = SolrConnection(self.solr_url)
        c.add(**data)
        txn = transaction.get()
        # commit/close the Solr connection when the ZODB transaction ends
        txn.addAfterCommitHook(SolrTransactionHook(c))


    # PortalCatalog api overloads
    security.declareProtected(ModifyPortalContent, 'indexObject')
    def indexObject(self, object) :
        """ Add to catalog and send to Solr """
        super(CatalogTool, self).indexObject(object)
        self.solrAdd(object)

    security.declarePrivate('reindexObject')
    def reindexObject(self, object, idxs=(), update_metadata=1, uid=None):
        """Reindex `object` in the ZODB catalog, then refresh in Solr the
        subset of `idxs` that is both valid and delegated (all delegated
        indexes when `idxs` is empty)."""
        super(CatalogTool, self).reindexObject(object,
                                               idxs=idxs,
                                               update_metadata=update_metadata,
                                               uid=uid)
        if idxs :
            # Filter out invalid indexes and those not delegated to Solr.
            valid_indexes = self._catalog.indexes.keys()
            idxs = [i for i in idxs if i in valid_indexes and i in self.delegatedIndexes]
        else :
            idxs = self.delegatedIndexes

        if idxs :
            self.solrAdd(object, idxs=idxs, uid=uid)

    security.declarePrivate('unindexObject')
    def unindexObject(self, object):
        """Remove from catalog.
        """
        super(CatalogTool, self).unindexObject(object)
        c = SolrConnection(self.solr_url)
        # see solrAdd for the name-mangling subtlety of self.__url
        url = self.__url(object)
        c.delete(id=url)
        txn = transaction.get()
        # Solr delete is committed only if the ZODB transaction commits
        txn.addAfterCommitHook(SolrTransactionHook(c))
103
104 InitializeClass(CatalogTool)
105
106
class DelegatedCatalog(Catalog) :
    """ZCatalog ``Catalog`` that delegates the full-text indexes to Solr.

    Queries on the delegated index names are answered by the Solr server
    configured on the owning CatalogTool; every other index is handled by
    the regular ZCatalog machinery.  ``search`` below is a copy of
    ``Products.ZCatalog.Catalog.Catalog.search`` with the Solr delegation
    inserted just before the index loop.

    Fixes: ``getDelegatedIndexes`` implements its own TODO by dropping the
    unreachable hard-coded tuple; ``delegateSearch`` uses the configured
    ``self.zcat.solr_url`` instead of a hard-coded URL.
    """

    def __init__(self, zcat, brains=None) :
        Catalog.__init__(self, brains=brains)
        # owning CatalogTool: supplies solr_url and delegatedIndexes
        self.zcat = zcat

    def getDelegatedIndexes(self) :
        """Return the names of the indexes delegated to Solr."""
        return self.zcat.delegatedIndexes

    def delegateSearch(self, query, plan) :
        """Run the Solr part of `query`, popping the delegated criteria
        out of `query` and `plan` in place.

        Return values:
          None         -- nothing was delegated; keep querying the other
                          indexes.
          empty IISet  -- Solr found no match; the whole search is empty
                          and can stop here.
          IISet of rid -- catalog record ids matching the Solr criteria.
        """
        indexes = set(plan).intersection(set(self.getDelegatedIndexes()))
        delegatedQuery = {}
        for i in indexes :
            delegatedQuery[i] = query.pop(i)
            plan.remove(i)
        if not delegatedQuery :
            return None
        # use the URL configured on the catalog tool (was hard-coded)
        c = SolrConnection(self.zcat.solr_url)
        q = ' AND '.join(['%s:"%s"' % item for item in delegatedQuery.items()])
        # rows=len(self): ask Solr for everything, since the result must be
        # intersected with the other indexes afterwards
        resp = c.query(q, fields='id', rows=len(self))
        c.close()
        # map Solr ids (catalog uids) back to rids; drop unknown ids
        return IISet(filter(None, [self.uids.get(r['id']) for r in resp.results]))

    def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjunction with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual."""

        rs = None # resultset

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        # Solr delegation: pops the delegated criteria from query/plan
        rs = self.delegateSearch(query, plan)
        if rs is not None and not rs :
            # Solr matched nothing -- the intersection can only be empty
            return LazyCat([])

        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue

            cr.start_split(i)
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 2.14
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                # weightedIntersection preserves the values from any mappings
                # we get, as some indexes don't return simple sets
                if hasattr(rs, 'items') or hasattr(r, 'items'):
                    _, rs = weightedIntersection(rs, r)
                else:
                    rs = intersection(rs, r)

                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result with
                # the total resultset to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit

        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 2.14 this will result in an empty LazyCat '
                          'to be returned.' % repr(cr.make_key(query)),
                          DeprecationWarning, stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                    b_start, b_size)
                result = LazyMap(self.instantiate, sequence, slen,
                    actual_result_count=rlen)
            else:
                cr.start_split('sort_on')
                result = self.sortResults(
                    self.data, sort_index, reverse, limit, merge,
                        actual_result_count=rlen, b_start=b_start,
                        b_size=b_size)
                cr.stop_split('sort_on', None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'items'):
                # having a 'items' means we have a data structure with
                # scores. Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                              for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on')

                    rs = rs.byValue(0) # sort it by score
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        # NOTE(review): aq_parent is not imported anywhere in
                        # this module, so this scored branch would raise
                        # NameError if reached -- confirm against the
                        # ZCatalog module this method was copied from.
                        r=self._v_result_class(self.data[key])\
                              .__of__(aq_parent(self))
                        r.data_record_id_ = key
                        r.data_record_score_ = score
                        r.data_record_normalized_score_ = int(100. * score / max)
                        return r

                    sequence, slen = self._limit_sequence(rs, rlen, b_start,
                        b_size)
                    result = LazyMap(getScoredResult, sequence, slen,
                        actual_result_count=rlen)
                    cr.stop_split('sort_on', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                    b_size)
                result = LazyMap(self.__getitem__, sequence, slen,
                    actual_result_count=rlen)
            else:
                # sort. If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query. This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split('sort_on')
                result = self.sortResults(rs, sort_index, reverse, limit,
                    merge, actual_result_count=rlen, b_start=b_start,
                    b_size=b_size)
                cr.stop_split('sort_on', None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result