+++ /dev/null
-# -*- coding: utf-8 -*-
-
-"""
-Inspiré de l'UnicodeSplitter de Plone 2.1, avec un aller-retour Unicode <-> UTF-8 pour le découpage.
-
-$Id: Utf8Splitter.py 16 2009-08-31 11:36:17Z pin $
-$URL: http://svn.cri.ensmp.fr/svn/Utf8Splitter/trunk/Utf8Splitter.py $
-
-"""
-
-# Python
-import re
-from htmlentitydefs import name2codepoint
-from unicodedata import decomposition
-from string import printable
-import logging
-from types import UnicodeType
-console = logging.getLogger('Utf8Splitter')
-
-# Zope
-from Products.ZCTextIndex.ISplitter import ISplitter
-from Products.ZCTextIndex.PipelineFactory import element_factory
-
# Word pattern: one or more \w characters. re.UNICODE makes \w follow the
# Unicode character properties instead of the ASCII-only definition.
rx = re.compile(r"\w+", re.UNICODE)
# Glob-aware word pattern: starts like a word, then may continue with word
# characters and the wildcard characters '*' and '?'.
rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

# A '<', any run of characters other than '<' and '>', then a '>'.
rtag = re.compile(r"<[^<>]*>")
# A named character entity such as '&eacute;'; the bare name is captured in
# the 'entName' group.
rent = re.compile(r"&(?P<entName>[A-Za-z]+);")

# Lookup table whose keys are the characters of string.printable.
_printable = dict.fromkeys(printable, True)
# Membership test on that table. dict.has_key() is deprecated and absent from
# Python 3; __contains__ returns the same True/False on every version.
isPrintable = _printable.__contains__
-
class Utf8Splitter:
    """Word splitter for plain text supplied as UTF-8 byte strings.
    """
    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Return the words found in the items of lst, encoded as UTF-8.

        Each item is decoded as UTF-8 (bytes that cannot be decoded are
        ignored) and scanned with wordpat; matches are kept in the order
        they are found.
        """
        words = []
        for chunk in lst:
            text = unicode(chunk, 'utf-8', errors='ignore')
            words.extend(wordpat.findall(text))
        return [word.encode('utf-8') for word in words]

    def processGlob(self, lst):
        """Same as process(), using the rxGlob pattern instead of rx."""
        return self.process(lst, rxGlob)
-
-
-
class Utf8HTMLAwareSplitter:
    """Word splitter for UTF-8 byte strings that may contain HTML markup.
    """
    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Return the words found in the items of lst, encoded as UTF-8.

        For each item, every rtag match is replaced by a space and every
        rent match is rewritten by _convertEnt; the result is then decoded
        as UTF-8 (undecodable bytes are ignored) and scanned with wordpat.
        """
        words = []
        for raw in lst:
            without_tags = rtag.sub(' ', raw)
            with_entities = rent.sub(_convertEnt, without_tags)
            text = with_entities.decode('utf-8', 'ignore')
            words.extend(wordpat.findall(text))
        return [word.encode('utf-8') for word in words]

    def processGlob(self, lst):
        """Same as process(), using the rxGlob pattern instead of rx."""
        return self.process(lst, rxGlob)
-
-
-
class DesaccUtf8Splitter(Utf8Splitter):
    """Plain-text UTF-8 word splitter that strips accents before splitting.
    """
    def process(self, lst, wordpat=rx):
        """Run every item of lst through _desacc, then split as the parent does."""
        unaccented = []
        for chunk in lst:
            unaccented.append(_desacc(chunk))
        return Utf8Splitter.process(self, unaccented, wordpat)
-
-
-
class DesaccUtf8HTMLAwareSplitter(Utf8HTMLAwareSplitter):
    """HTML-aware UTF-8 word splitter that strips accents before splitting.
    """
    def process(self, lst, wordpat=rx):
        """Return the words found in the items of lst, encoded as UTF-8.

        For each item, every rtag match is replaced by a space, every rent
        match is rewritten by _convertEnt, and the result goes through
        _desacc before being scanned with wordpat. Unlike the parent class,
        the text is not decoded here: wordpat is applied directly to the
        value returned by _desacc.
        """
        words = []
        for raw in lst:
            without_tags = rtag.sub(' ', raw)
            with_entities = rent.sub(_convertEnt, without_tags)
            unaccented = _desacc(with_entities)
            words.extend(wordpat.findall(unaccented))
        return [word.encode('utf-8') for word in words]
-
class _Utf8Utils(object):
    """HTML entity conversion and accent removal helpers for UTF-8 text.

    Only one instance ever exists: __new__ stores it in _singleton and
    returns it on every call. The instance keeps, in _cache, the result of
    _recurseDecomposition for each character already processed.
    """

    _singleton = None

    def __new__(cls):
        if cls._singleton is None:
            cls._singleton = object.__new__(cls)
        return cls._singleton

    def __init__(self):
        # Python runs __init__ after every _Utf8Utils() call, including the
        # calls where __new__ returned the already existing instance. Create
        # the cache only once so that re-instantiating does not wipe it.
        if not hasattr(self, '_cache'):
            self._cache = {}

    @staticmethod
    def convertEnt(m):
        """Return the UTF-8 encoding of the HTML entity matched by m.

        m is a match object with an 'entName' group. A name that is not in
        name2codepoint is converted to code point 32 (a space).
        """
        codepoint = name2codepoint.get(m.group('entName'), 32)
        return unichr(codepoint).encode('utf-8')

    def udesacc(self, uchaine):
        """Return the unicode string uchaine with each character replaced
        by its cached or freshly computed _recurseDecomposition result.
        """
        cache = self._cache
        pieces = []
        for uc in uchaine:
            # Test membership rather than truthiness: characters that are
            # dropped are cached as u'' and must still count as cache hits.
            if uc in cache:
                pieces.append(cache[uc])
            else:
                pieces.append(self._recurseDecomposition(uc))
        return u''.join(pieces)

    def desacc(self, chaine):
        """Remove accents from the UTF-8 byte string chaine.

        chaine is decoded as UTF-8 (undecodable bytes are ignored), passed
        through udesacc and encoded back to UTF-8. If decoding raises
        UnicodeEncodeError and chaine is a unicode object, a warning is
        logged and chaine is processed as is; for any other type the
        exception is re-raised.
        """
        try:
            uchaine = chaine.decode('utf-8', 'ignore')
        except UnicodeEncodeError:
            if not isinstance(chaine, UnicodeType):
                raise
            console.warning('already unicode value passed to desacc: %r', chaine)
            uchaine = chaine
        return self.udesacc(uchaine).encode('utf-8')

    def _recurseDecomposition(self, uc):
        """Fully decompose the character uc and keep its printable parts.

        The unicodedata decomposition of uc is expanded repeatedly until no
        component can be decomposed further; '<...>' formatting tags are
        skipped. A character without decomposition stands for itself. Only
        components accepted by isPrintable are kept. The result (possibly
        an empty string) is stored in the cache and returned.
        """
        pending = decomposition(uc).split()
        if pending:
            components = []
            # Codes are popped from the end and sub-decompositions pushed
            # back on the same end, so components come out in reverse order.
            while pending:
                code = pending.pop()
                if code.startswith('<'):
                    continue
                char = unichr(int(code, 16))
                nested = decomposition(char).split()
                if nested:
                    pending.extend(nested)
                else:
                    components.append(char)
            components.reverse()
        else:
            components = [uc]

        kept = u''.join([char for char in components if isPrintable(char)])
        self._cache[uc] = kept
        return kept


# Shared instance and the module-level shortcuts used by the splitters.
Utf8Utils = _Utf8Utils()

_desacc = Utf8Utils.desacc
_convertEnt = Utf8Utils.convertEnt
-
try:
    # All splitters are registered under the 'Word Splitter' group. The
    # whole sequence sits in one try block, so the first ValueError stops
    # the remaining registrations as well.
    for _description, _factory in (
            ('UTF-8 Whitespace splitter', Utf8Splitter),
            ('UTF-8 HTML Aware splitter', Utf8HTMLAwareSplitter),
            ('UTF-8 Whitespace splitter with accents removal',
             DesaccUtf8Splitter),
            ('UTF-8 HTML Aware splitter with accents removal',
             DesaccUtf8HTMLAwareSplitter),
    ):
        element_factory.registerFactory('Word Splitter', _description, _factory)
except ValueError:
    # ValueError is raised when a splitter is already registered.
    pass
-
-