# -*- coding: utf-8 -*-

"""
Inspired by the UnicodeSplitter of Plone 2.1, with a Unicode <-> UTF-8 round
trip for the splitting.

$Id: Utf8Splitter.py 16 2009-08-31 11:36:17Z pin $
$URL: http://svn.cri.ensmp.fr/svn/Utf8Splitter/trunk/Utf8Splitter.py $

"""

# Provenance: recovered from the gitweb blobdiff
# https://scm.cri.ensmp.fr/git/Utf8Splitter.git/blobdiff_plain/d25275c7a6284a8da05e40f231f2e9a3a30d93b5:/Utf8Splitter.py..50b88da70954fb7827784be1ce14d6f75ae9072e:/Products/Utf8Splitter/static/gitweb.js
# (index 8280825..0000000, where Utf8Splitter.py is recorded as deleted).

# Python
import re
from htmlentitydefs import name2codepoint
from unicodedata import decomposition
from string import printable
import logging
from types import UnicodeType
console = logging.getLogger('Utf8Splitter')

# Zope
from Products.ZCTextIndex.ISplitter import ISplitter
from Products.ZCTextIndex.PipelineFactory import element_factory

# Word patterns: plain words, and words that may carry '*' / '?' glob
# characters after their first word character.
rx = re.compile(r"\w+", re.UNICODE)
rxGlob = re.compile(r"\w+[\w*?]*", re.UNICODE)

# Markup patterns: any <...> tag, and purely alphabetic named entities.
# The group must be called 'entName': _Utf8Utils.convertEnt reads it by name.
rtag = re.compile(r"<[^<>]*>")
rent = re.compile(r"&(?P<entName>[A-Za-z]+);")

# Membership test for the characters of string.printable.
_printable = dict.fromkeys(printable, True)
isPrintable = _printable.__contains__


class Utf8Splitter:
    """Plain-text UTF-8 whitespace splitter
    """
    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Split every UTF-8 byte string of `lst` into words.

        Each string is decoded (undecodable bytes are ignored), scanned with
        `wordpat`, and the matches are returned re-encoded as UTF-8, in order.
        """
        result = []
        for s in lst:
            result += wordpat.findall(unicode(s, 'utf-8', errors='ignore'))
        return [r.encode('utf-8') for r in result]

    def processGlob(self, lst):
        """Same as process(), but keeps '*' and '?' inside the words."""
        return self.process(lst, rxGlob)


class Utf8HTMLAwareSplitter:
    """HTML-aware UTF-8 whitespace splitter
    """
    __implements__ = ISplitter

    def process(self, lst, wordpat=rx):
        """Split every UTF-8 byte string of `lst` into words, ignoring markup.

        Tags are replaced by a space and named entities by their UTF-8
        character (a space when the name is unknown) before the string is
        decoded and scanned with `wordpat`. Matches are returned as UTF-8.
        """
        result = []
        for s in lst:
            s = rtag.sub(' ', s)
            s = rent.sub(_convertEnt, s)
            s = s.decode('utf-8', 'ignore')

            result += wordpat.findall(s)

        return [r.encode('utf-8') for r in result]

    def processGlob(self, lst):
        """Same as process(), but keeps '*' and '?' inside the words."""
        return self.process(lst, rxGlob)


class DesaccUtf8Splitter(Utf8Splitter):
    """Plain-text UTF-8 whitespace splitter with accents removal
    """
    def process(self, lst, wordpat=rx):
        """Strip accents from every string, then split as Utf8Splitter does."""
        return Utf8Splitter.process(self, [_desacc(s) for s in lst], wordpat)


class DesaccUtf8HTMLAwareSplitter(Utf8HTMLAwareSplitter):
    """HTML-aware UTF-8 whitespace splitter with accents removal
    """
    def process(self, lst, wordpat=rx):
        """Drop markup, strip accents, then split into words.

        The order matters: entities are expanded before _desacc runs, so that
        the characters they produce go through accent removal as well.
        """
        result = []
        for s in lst:
            s = rtag.sub(' ', s)
            s = rent.sub(_convertEnt, s)
            # _desacc returns a byte string; _recurseDecomposition only keeps
            # characters of string.printable, so the pattern is applied to
            # that filtered byte string rather than to a unicode object.
            s = _desacc(s)

            result += wordpat.findall(s)

        return [r.encode('utf-8') for r in result]


class _Utf8Utils(object):
    """Shared helpers: entity conversion and cached accent removal."""

    _singleton = None

    def __new__(cls):
        # The cache is created here, exactly once, together with the shared
        # instance. Setting it in __init__ would empty it on every
        # _Utf8Utils() call, because __init__ runs again on the instance
        # returned by __new__.
        if cls._singleton is None:
            instance = object.__new__(cls)
            instance._cache = {}
            cls._singleton = instance
        return cls._singleton

    @staticmethod
    def convertEnt(m):
        """Convert an HTML entity to its UTF-8 representation.

        `m` is a match of `rent`; unknown entity names yield a space.
        """
        return unichr(name2codepoint.get(m.group('entName'), 32)).encode('utf-8')

    def udesacc(self, uchaine):
        """Return the unicode string `uchaine` with accents removed."""
        cache = self._cache
        ret = []
        for uc in uchaine:
            replacement = cache.get(uc)
            # Compare with None: a cached empty string (character dropped
            # entirely) is a valid hit and must not be recomputed.
            if replacement is None:
                replacement = self._recurseDecomposition(uc)
            ret.append(replacement)

        return u''.join(ret)

    def desacc(self, chaine):
        """Strip accents from a UTF-8 string.

        Returns a UTF-8 encoded byte string. A unicode object is tolerated
        (with a logged warning) instead of a UTF-8 byte string.
        """
        try:
            uchaine = chaine.decode('utf-8', 'ignore')
        except UnicodeEncodeError:
            # Python 2 encodes a unicode object with the default codec before
            # decoding it, which fails on non-ASCII characters.
            if not isinstance(chaine, UnicodeType):
                raise
            console.warning('already unicode value passed to desacc: %r', chaine)
            uchaine = chaine
        return self.udesacc(uchaine).encode('utf-8')

    def _recurseDecomposition(self, uc):
        """Fully decompose the character `uc`, keep printable parts, cache it.

        Returns the (possibly empty) unicode replacement for `uc`.
        """
        deco = decomposition(uc).split()
        fullDeco = []
        if deco:
            # `deco` is used as a stack: code points are popped from the end,
            # so leaves are collected backwards and reversed afterwards.
            while deco:
                code = deco.pop()
                if code.startswith('<'):
                    # decomposition tag such as '<compat>', not a code point
                    continue
                c = unichr(int(code, 16))
                subDeco = decomposition(c).split()
                if subDeco:
                    deco.extend(subDeco)
                else:
                    fullDeco.append(c)
            fullDeco.reverse()
        else:
            fullDeco.append(uc)

        # Only characters of string.printable survive (combining marks and
        # any other character outside that set are dropped).
        fullDeco = u''.join([c for c in fullDeco if isPrintable(c)])
        self._cache[uc] = fullDeco
        return fullDeco


Utf8Utils = _Utf8Utils()

_desacc = Utf8Utils.desacc
_convertEnt = Utf8Utils.convertEnt


def _registerSplitters():
    """Register the four splitters as 'Word Splitter' pipeline elements."""
    splitters = (
        ('UTF-8 Whitespace splitter', Utf8Splitter),
        ('UTF-8 HTML Aware splitter', Utf8HTMLAwareSplitter),
        ('UTF-8 Whitespace splitter with accents removal',
         DesaccUtf8Splitter),
        ('UTF-8 HTML Aware splitter with accents removal',
         DesaccUtf8HTMLAwareSplitter),
    )
    for name, factory in splitters:
        # One try per registration, so that an already registered name does
        # not prevent the following splitters from being registered.
        try:
            element_factory.registerFactory('Word Splitter', name, factory)
        except ValueError:
            # in case the splitter is already registered, ValueError is raised
            pass


_registerSplitters()