Retrait du nettoyage automatique de html : encore une lubie de geek…
[Plinn.git] / catalog.py
1 # -*- coding: utf-8 -*-
2 from App.class_init import InitializeClass
3 from AccessControl import ClassSecurityInfo
4 from Products.CMFCore.interfaces import IIndexableObject
5 from Products.CMFCore.CatalogTool import CatalogTool as BaseCatalogTool
6 from Products.CMFCore.CatalogTool import IndexableObjectWrapper
7 from Products.PageTemplates.PageTemplateFile import PageTemplateFile
8 from Products.CMFCore.permissions import ModifyPortalContent, ManagePortal
9 from zope.component import queryMultiAdapter
10 from Products.ZCatalog.Catalog import Catalog
11 import transaction
12 from solr import *
13
14 # imports for Catalog class
15 from Products.PluginIndexes.interfaces import ILimitedResultIndex
16 from Products.ZCatalog.Lazy import LazyMap, LazyCat, LazyValues
17 from BTrees.IIBTree import intersection, IISet
18 from BTrees.IIBTree import weightedIntersection
19 import warnings
20
# Name of the volatile (``_v_``) attribute under which CatalogTool caches its
# open Solr connection; SolrTransactionHook deletes that attribute once the
# transaction is over.
_VOLATILE_SOLR_NAME = '_v_solrConnection'
22
class SolrTransactionHook(object):
    """After-commit hook coupling the Solr commit to the ZODB commit.

    An instance is registered with ``addAfterCommitHook`` on the current
    transaction and is later called with the transaction's outcome.
    """

    def __init__(self, context, con):
        """Remember what has to be finalised.

        context -- object that caches ``con`` in its volatile attribute
                   (named by ``_VOLATILE_SOLR_NAME``)
        con     -- Solr connection; must provide ``commit()`` and ``close()``
        """
        self.context = context
        self.con = con

    def __call__(self, status):
        """Finish the Solr side of the transaction.

        status -- true value: the pending Solr changes are committed;
                  false value: the connection is only closed.

        The connection is always closed and the cached attribute always
        removed, even when ``commit()`` or ``close()`` raises.  Otherwise a
        stale connection would stay cached on ``context`` and no new hook
        would be registered for it.  Any exception still propagates to the
        caller.
        """
        try:
            if status:
                self.con.commit()
        finally:
            try:
                self.con.close()
            finally:
                try:
                    delattr(self.context, _VOLATILE_SOLR_NAME)
                except AttributeError:
                    # Already gone (e.g. the volatile attribute was evicted).
                    pass
39
class CatalogTool(BaseCatalogTool):
    """CMF catalog tool that mirrors a set of "delegated" indexes into Solr.

    Objects are still catalogued by the inherited tool; in addition, the
    values of the indexes named in ``delegatedIndexes`` are sent to the Solr
    server at ``solr_url``.  Queries on those indexes are answered by Solr
    through ``DelegatedCatalog``, which replaces the default ``_catalog``.
    """

    meta_type = 'Plinn Catalog'
    security = ClassSecurityInfo()
    # Insert a "Solr" management tab after the first five inherited tabs.
    manage_options = (BaseCatalogTool.manage_options[:5] +
                      ({'label': 'Solr', 'action': 'manage_solr'},) +
                      BaseCatalogTool.manage_options[5:])
    manage_solr = PageTemplateFile('www/manage_solr.pt', globals(),
                                   __name__='manage_solr')

    def __init__(self, idxs=None):
        """Create the tool with its default Solr settings.

        idxs -- accepted for signature compatibility only; it is not used.
        """
        super(CatalogTool, self).__init__()
        self._catalog = DelegatedCatalog(self)
        self.solr_url = 'http://localhost:8983/solr'
        self.delegatedIndexes = ('Title', 'Description', 'SearchableText')

    security.declarePublic('getDelegatedIndexes')
    def getDelegatedIndexes(self):
        """Return the tuple of index names that are delegated to Solr."""
        return self.delegatedIndexes

    security.declareProtected(ManagePortal, 'setDelegatedIndexes')
    def setDelegatedIndexes(self, indexes, REQUEST=None):
        """Replace the delegated index names.

        indexes -- iterable of strings; each name is stripped and empty
                   entries are dropped
        REQUEST -- when given, the response is redirected back to the
                   ``manage_solr`` tab with a status message
        """
        stripped = [name.strip() for name in indexes]
        self.delegatedIndexes = tuple([name for name in stripped if name])
        if REQUEST:
            REQUEST.RESPONSE.redirect(self.absolute_url() + '/manage_solr?manage_tabs_message=Saved changes.')

    def _getSolrConnection(self):
        """Return the Solr connection cached on this tool.

        On first use the connection is opened, stored in the volatile
        attribute named by ``_VOLATILE_SOLR_NAME``, and a
        ``SolrTransactionHook`` is registered as an after-commit hook on the
        current transaction.
        """
        if not hasattr(self, _VOLATILE_SOLR_NAME):
            con = SolrConnection(self.solr_url)
            setattr(self, _VOLATILE_SOLR_NAME, con)
            txn = transaction.get()
            txn.addAfterCommitHook(SolrTransactionHook(self, con))
        return getattr(self, _VOLATILE_SOLR_NAME)

    security.declarePrivate('solrAdd')
    def solrAdd(self, w, uid, idxs):
        """Send the delegated index values of ``w`` to Solr.

        w    -- indexable object (wrapper) the values are read from
        uid  -- catalog uid, used as the Solr document ``id``
        idxs -- names of the indexes being updated; a false value means
                "all of them"

        Nothing is sent when ``idxs`` names only non-delegated indexes.
        Otherwise the values of *all* delegated indexes are sent: adding a
        document under an existing id replaces the stored document, so a
        partial (or id-only) document would erase the other fields.
        """
        delegated = self.delegatedIndexes
        if idxs and not [name for name in idxs if name in delegated]:
            # Only non-delegated indexes changed: leave the Solr document
            # (and the Solr connection) alone.
            return
        data = {'id': uid}
        for name in delegated:
            attr = getattr(w, name, '')
            data[name] = attr() if callable(attr) else attr
        c = self._getSolrConnection()
        c.add(**data)

    # PortalCatalog api overloads
    def catalog_object(self, obj, uid=None, idxs=None, update_metadata=1,
                       pghandler=None):
        """Catalog ``obj`` in the ZCatalog and mirror it into Solr.

        The object is wrapped with workflow and accessibility information
        (``IIndexableObject``) just before cataloging.  Parameters are those
        of the inherited ``catalog_object``.
        """
        if IIndexableObject.providedBy(obj):
            w = obj
        else:
            w = queryMultiAdapter((obj, self), IIndexableObject)
            if w is None:
                # BBB
                w = IndexableObjectWrapper(obj, self)

        idxs_ = idxs
        if idxs:
            # Filter out invalid indexes.
            valid_indexes = self._catalog.indexes.keys()
            idxs_ = [i for i in idxs if i in valid_indexes]

        super(CatalogTool, self).catalog_object(w, uid, idxs_, update_metadata, pghandler)
        if uid is None:
            # Solr needs a real document id; resolve it the same way
            # reindexObject() does instead of sending ``id=None``.
            # NOTE(review): assumes this matches the uid the base class
            # derives on its own when none is passed -- confirm.
            uid = self.__url(w)
        # The unfiltered ``idxs`` is passed on purpose: delegated indexes
        # need not exist in the ZCatalog.
        self.solrAdd(w, uid, idxs)

    security.declarePrivate('reindexObject')
    def reindexObject(self, object, idxs=None, update_metadata=1, uid=None):
        """Update catalog after object data has changed.

        The optional idxs argument is a list of specific indexes
        to update (all of them by default).

        The update_metadata flag controls whether the object's
        metadata record is updated as well.

        If a non-None uid is passed, it will be used as the catalog uid
        for the object instead of its physical path.
        """
        if idxs is None:
            idxs = []
        if uid is None:
            # ``self.__url`` is name-mangled to ``_CatalogTool__url``.
            # NOTE(review): presumably this resolves to the private helper
            # of the CMFCore base class, which is also named ``CatalogTool``
            # -- renaming this class would break the lookup.
            uid = self.__url(object)

        self.catalog_object(object, uid, idxs, update_metadata)

    security.declarePrivate('unindexObject')
    def unindexObject(self, object):
        """Remove from catalog (both the ZCatalog and Solr)."""
        super(CatalogTool, self).unindexObject(object)
        c = self._getSolrConnection()
        url = self.__url(object)
        c.delete(id=url)
136
# Zope class initialisation for CatalogTool, run once at import time.
# NOTE(review): presumably this is what applies the ``security`` declarations
# made in the class body -- confirm against the App.class_init documentation.
InitializeClass(CatalogTool)
138
139
class DelegatedCatalog(Catalog):
    """Catalog whose search delegates some indexes to Solr.

    This is where the delegation to Solr actually happens.
    """

    def __init__(self, zcat, brains=None):
        """zcat -- owning catalog tool; must expose ``delegatedIndexes`` and
        ``solr_url``.  brains -- passed through to ``Catalog.__init__``.
        """
        Catalog.__init__(self, brains=brains)
        self.zcat = zcat

    @staticmethod
    def _quoteSolrPhrase(value):
        """Return ``value`` as a double-quoted phrase for a Solr query.

        Backslashes and double quotes in string values are escaped so that
        user-supplied text cannot terminate the phrase early.
        """
        if isinstance(value, (str, type(u''))):
            value = value.replace('\\', '\\\\').replace('"', '\\"')
        return '"%s"' % (value,)

    def delegateSearch(self, query, plan):
        """Answer the delegated part of ``query`` with Solr.

        The delegated keys are removed from ``query`` and from ``plan`` (both
        are modified in place) so that the regular index loop skips them.

        Falsy return values:
        None          -- no delegation took place; the other indexes must
                         still be queried.
        empty IISet() -- the delegated query matched nothing; the search can
                         stop.

        Otherwise returns the IISet of record ids of the Solr hits known to
        this catalog.
        """
        indexes = set(query.keys()).intersection(set(self.zcat.delegatedIndexes))
        if not indexes:
            return None
        delegatedQuery = {}
        for i in indexes:
            delegatedQuery[i] = query.pop(i)
            try:
                plan.remove(i)
            except ValueError:
                pass
        q = ' AND '.join(['%s:%s' % (name, self._quoteSolrPhrase(value))
                          for name, value in delegatedQuery.items()])
        c = SolrConnection(self.zcat.solr_url)
        try:
            # Ask for as many rows as there are catalogued records.
            resp = c.query(q, fields='id', rows=len(self))
        finally:
            # Close the connection even when the query fails.
            c.close()
        rids = [self.uids.get(r['id']) for r in resp.results]
        # Drop only unknown uids (None); a record id of 0 is legitimate.
        return IISet([rid for rid in rids if rid is not None])

    def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
        """Iterate through the indexes, applying the query to each one. If
        merge is true then return a lazy result set (sorted if appropriate)
        otherwise return the raw (possibly scored) results for later merging.
        Limit is used in conjunction with sorting or scored results to inform
        the catalog how many results you are really interested in. The catalog
        can then use optimizations to save time and memory. The number of
        results is not guaranteed to fall within the limit however, you should
        still slice or batch the results as usual.

        Before the index loop, the delegated part of the query is answered by
        ``delegateSearch`` and its result seeds the result set.
        """

        rs = None # resultset

        # Indexes fulfill a fairly large contract here. We hand each
        # index the query mapping we are given (which may be composed
        # of some combination of web request, kw mappings or plain old dicts)
        # and the index decides what to do with it. If the index finds work
        # for itself in the query, it returns the results and a tuple of
        # the attributes that were used. If the index finds nothing for it
        # to do then it returns None.

        # Canonicalize the request into a sensible query before passing it on
        query = self.make_query(query)

        cr = self.getCatalogPlan(query)
        cr.start()

        plan = cr.plan()
        if not plan:
            plan = self._sorted_search_indexes(query)

        # delegation: an empty (but not None) result means Solr matched
        # nothing, so the whole search is empty
        rs = self.delegateSearch(query, plan)
        if rs is not None and not rs:
            return LazyCat([])

        indexes = self.indexes.keys()
        for i in plan:
            if i not in indexes:
                # We can have bogus keys or the plan can contain index names
                # that have been removed in the meantime
                continue

            index = self.getIndex(i)
            _apply_index = getattr(index, "_apply_index", None)
            if _apply_index is None:
                continue

            cr.start_split(i)
            limit_result = ILimitedResultIndex.providedBy(index)
            if limit_result:
                r = _apply_index(query, rs)
            else:
                r = _apply_index(query)

            if r is not None:
                r, u = r
                # Short circuit if empty result
                # BBB: We can remove the "r is not None" check in Zope 2.14
                # once we don't need to support the "return everything" case
                # anymore
                if r is not None and not r:
                    cr.stop_split(i, result=None, limit=limit_result)
                    return LazyCat([])

                # provide detailed info about the pure intersection time
                intersect_id = i + '#intersection'
                cr.start_split(intersect_id)
                # weightedIntersection preserves the values from any mappings
                # we get, as some indexes don't return simple sets
                if hasattr(rs, 'items') or hasattr(r, 'items'):
                    _, rs = weightedIntersection(rs, r)
                else:
                    rs = intersection(rs, r)

                cr.stop_split(intersect_id)

                # consider the time it takes to intersect the index result with
                # the total resultset to be part of the index time
                cr.stop_split(i, result=r, limit=limit_result)
                if not rs:
                    break
            else:
                cr.stop_split(i, result=None, limit=limit_result)

        # Try to deduce the sort limit from batching arguments
        b_start = int(query.get('b_start', 0))
        b_size = query.get('b_size', None)
        if b_size is not None:
            b_size = int(b_size)

        if b_size is not None:
            limit = b_start + b_size
        elif limit and b_size is None:
            b_size = limit

        if rs is None:
            # None of the indexes found anything to do with the query
            # We take this to mean that the query was empty (an empty filter)
            # and so we return everything in the catalog
            warnings.warn('Your query %s produced no query restriction. '
                          'Currently the entire catalog content is returned. '
                          'In Zope 2.14 this will result in an empty LazyCat '
                          'to be returned.' % repr(cr.make_key(query)),
                          DeprecationWarning, stacklevel=3)

            rlen = len(self)
            if sort_index is None:
                sequence, slen = self._limit_sequence(self.data.items(), rlen,
                    b_start, b_size)
                result = LazyMap(self.instantiate, sequence, slen,
                    actual_result_count=rlen)
            else:
                cr.start_split('sort_on')
                result = self.sortResults(
                    self.data, sort_index, reverse, limit, merge,
                        actual_result_count=rlen, b_start=b_start,
                        b_size=b_size)
                cr.stop_split('sort_on', None)
        elif rs:
            # We got some results from the indexes.
            # Sort and convert to sequences.
            # XXX: The check for 'values' is really stupid since we call
            # items() and *not* values()
            rlen = len(rs)
            if sort_index is None and hasattr(rs, 'items'):
                # having a 'items' means we have a data structure with
                # scores.  Build a new result set, sort it by score, reverse
                # it, compute the normalized score, and Lazify it.

                if not merge:
                    # Don't bother to sort here, return a list of
                    # three tuples to be passed later to mergeResults
                    # note that data_record_normalized_score_ cannot be
                    # calculated and will always be 1 in this case
                    getitem = self.__getitem__
                    result = [(score, (1, score, rid), getitem)
                            for rid, score in rs.items()]
                else:
                    cr.start_split('sort_on')

                    rs = rs.byValue(0) # sort it by score
                    max = float(rs[0][0])

                    # Here we define our getter function inline so that
                    # we can conveniently store the max value as a default arg
                    # and make the normalized score computation lazy
                    def getScoredResult(item, max=max, self=self):
                        """
                        Returns instances of self._v_brains, or whatever is
                        passed into self.useBrains.
                        """
                        score, key = item
                        # NOTE(review): ``aq_parent`` is not imported in the
                        # visible part of this module; unless
                        # ``from solr import *`` provides it, this line
                        # raises NameError -- confirm and import it from
                        # Acquisition if needed.
                        r=self._v_result_class(self.data[key])\
                              .__of__(aq_parent(self))
                        r.data_record_id_ = key
                        r.data_record_score_ = score
                        r.data_record_normalized_score_ = int(100. * score / max)
                        return r

                    sequence, slen = self._limit_sequence(rs, rlen, b_start,
                        b_size)
                    result = LazyMap(getScoredResult, sequence, slen,
                        actual_result_count=rlen)
                    cr.stop_split('sort_on', None)

            elif sort_index is None and not hasattr(rs, 'values'):
                # no scores
                if hasattr(rs, 'keys'):
                    rs = rs.keys()
                sequence, slen = self._limit_sequence(rs, rlen, b_start,
                    b_size)
                result = LazyMap(self.__getitem__, sequence, slen,
                    actual_result_count=rlen)
            else:
                # sort.  If there are scores, then this block is not
                # reached, therefore 'sort-on' does not happen in the
                # context of a text index query.  This should probably
                # sort by relevance first, then the 'sort-on' attribute.
                cr.start_split('sort_on')
                result = self.sortResults(rs, sort_index, reverse, limit,
                    merge, actual_result_count=rlen, b_start=b_start,
                    b_size=b_size)
                cr.stop_split('sort_on', None)
        else:
            # Empty result set
            result = LazyCat([])
        cr.stop()
        return result