
    =wg'                         d dl mZ d dlmZ d dlmZ d dlmZ d dlm	Z	m
Z
  G d de      Z G d d	e      Z G d
 de      Zy)    )Filter)integer_types)double_metaphone)stem)	lfu_cacheunbound_cachec                   \    e Zd ZdZ eee      ZdZe	dddfdZ
d Zd Zd	 Zd
 Zd Zd Zy)
StemFilteraH  Stems (removes suffixes from) the text of tokens using the Porter
    stemming algorithm. Stemming attempts to reduce multiple forms of the same
    root word (for example, "rendering", "renders", "rendered", etc.) to a
    single word in the index.

    >>> stemmer = RegexTokenizer() | StemFilter()
    >>> [token.text for token in stemmer("fundamentally willows")]
    ["fundament", "willow"]

    You can pass your own stemming function to the StemFilter. The default
    is the Porter stemming algorithm for English.

    >>> stemfilter = StemFilter(stem_function)

    You can also use one of the Snowball stemming functions by passing the
    `lang` keyword argument.

    >>> stemfilter = StemFilter(lang="ru")

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` to check if a given language has
    a stemming function available.

    By default, this class wraps an LRU cache around the stemming function. The
    ``cachesize`` keyword argument sets the size of the cache. To make the
    cache unbounded (the class caches every input), use ``cachesize=-1``. To
    disable caching, use ``cachesize=None``.

    If you compile and install the py-stemmer library, the
    :class:`PyStemmerFilter` provides slightly easier access to the language
    stemmers in that library.
    )stemfnignoreTNP  c                     || _         || _        |
t               n
t        |      | _        || _        | j                          y)a	  
        :param stemfn: the function to use for stemming.
        :param lang: if not None, overrides the stemfn with a language stemmer
            from the ``whoosh.lang.snowball`` package.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache. Use ``-1`` for
            an unbounded cache, or ``None`` for no caching.
        N)r   lang	frozensetr   	cachesizeclear)selfr   r   r   r   s        L/var/www/horilla/myenv/lib/python3.12/site-packages/whoosh/analysis/morph.py__init__zStemFilter.__init__I   s6     	%+^ik69J"

    c                 z    t        | j                  D cg c]  }|dk7  r|| j                  |   f c}      S c c}w N_stemdict__dict__r   ks     r   __getstate__zStemFilter.__getstate__\   sC     DMM 'qg q)* ' ( 	( '   8c                     d|vrd| _         d|v r|d   | _        nd|vrt               | _        d|vrd | _        d|v r|d= | j                  j                  |       | j                          y )Nr   r   ignoresr   r   cache)r   r   r   r   r   updater   r   states     r   __setstate__zStemFilter.__setstate__b   sq     e#"DN	*DKU"#+DKDIegU#

r   c                 h   | j                   rddlm}  || j                         }n| j                  }t	        | j
                  t              r`| j
                  dk7  rQ| j
                  dk  rt        |      | _        y | j
                  dkD  r! t        | j
                        |      | _        y y || _        y )Nr   )stemmer_for_language   )
r   whoosh.langr)   r   
isinstancer   r   r   r   r   )r   r)   r   s      r   r   zStemFilter.cleart   s    998)$))4F[[Fdnnm419L~~!*62
!#6Yt~~6v>
 $  DJr   c                 V    | j                   dk  ry | j                  j                         S )Nr*   )r   r   
cache_infor   s    r   r.   zStemFilter.cache_info   s$    >>Qzz$$&&r   c                 p    |xr3 | j                   |j                   u xr | j                  |j                  k(  S N)	__class__r   r   others     r   __eq__zStemFilter.__eq__   s3     0$..EOO; 0KK5<</	1r   c              #      K   | j                   }| j                  }|D ]/  }|j                  s|j                  }||vr ||      |_        | 1 y wr1   )r   r   stoppedtext)r   tokensr   r   tr8   s         r   __call__zStemFilter.__call__   sO      	A99vvv%#D\AFG	s   AA)__name__
__module____qualname____doc__r   objectlist__inittypes__is_morphr   r   r   r'   r   r.   r5   r;    r   r   r
   r
   #   sG    B t4MH"de &($ '
1	r   r
   c                   6    e Zd ZdZd	dZd Zd Zd Zd Zd Z	y)
PyStemmerFilterzThis is a simple subclass of StemFilter that works with the py-stemmer
    third-party library. You must have the py-stemmer library installed to use
    this filter.

    >>> PyStemmerFilter("spanish")
    Nc                     || _         |
t               n
t        |      | _        || _        | j	                         | _        y)a  
        :param lang: a string identifying the stemming algorithm to use. You
            can get a list of available algorithms by with the
            :meth:`PyStemmerFilter.algorithms` method. The identification
            strings are directly from the py-stemmer library.
        :param ignore: a set/list of words that should not be stemmed. This is
            converted into a frozenset. If you omit this argument, all tokens
            are stemmed.
        :param cachesize: the maximum number of words to cache.
        N)r   r   r   r   _get_stemmer_fnr   )r   r   r   r   s       r   r   zPyStemmerFilter.__init__   s6     	%+^ik69J"))+
r   c                 *    ddl }|j                         S )zZReturns a list of stemming algorithms provided by the py-stemmer
        library.
        r   N)Stemmer
algorithms)r   rJ   s     r   rK   zPyStemmerFilter.algorithms   s    
 	!!##r   c                      y r1   rD   r/   s    r   r.   zPyStemmerFilter.cache_info   s    r   c                 z    dd l }|j                  | j                        }| j                  |_        |j                  S )Nr   )rJ   r   r   maxCacheSizestemWord)r   rJ   stemmers      r   rH   zPyStemmerFilter._get_stemmer_fn   s/    //$)),#~~r   c                 z    t        | j                  D cg c]  }|dk7  r|| j                  |   f c}      S c c}w r   r   r   s     r   r   zPyStemmerFilter.__getstate__   sC     DMM &qW q)* & ' 	' &r    c                     d|vrd| _         d|v r|d   | _        nd|vrt               | _        d|v r|d= | j                  j	                  |       | j                         | _        y )Nr   '  r"   r   r#   )r   r   r   r   r$   rH   r   r%   s     r   r'   zPyStemmerFilter.__setstate__   si     e#"DN	*DKU"#+DKegU#))+
r   )englishNrS   )
r<   r=   r>   r?   r   rK   r.   rH   r   r'   rD   r   r   rF   rF      s%    ,"$ ',r   rF   c                   (    e Zd ZdZdZddZd Zd Zy)DoubleMetaphoneFilterae  Transforms the text of the tokens using Lawrence Philips's Double
    Metaphone algorithm. This algorithm attempts to encode words in such a way
    that similar-sounding words reduce to the same code. This may be useful for
    fields containing the names of people and places, and other uses where
    tolerance of spelling differences is desireable.
    Tc                 .    || _         || _        || _        y)ad  
        :param primary_boost: the boost to apply to the token containing the
            primary code.
        :param secondary_boost: the boost to apply to the token containing the
            secondary code, if any.
        :param combine: if True, the original unencoded tokens are kept in the
            stream, preceding the encoded tokens.
        N)primary_boostsecondary_boostcombine)r   rX   rY   rZ   s       r   r   zDoubleMetaphoneFilter.__init__   s     +.r   c                 p    |xr3 | j                   |j                   u xr | j                  |j                  k(  S r1   )r2   rX   r3   s     r   r5   zDoubleMetaphoneFilter.__eq__   s8     >NNeoo5>&&%*=*==	?r   c              #     K   | j                   }| j                  }| j                  }|D ][  }|r| t        |j                        \  }}|j
                  }|r||_        ||z  |_        | |sG||_        ||z  |_        | ] y wr1   )rX   rY   rZ   r   r8   boost)	r   r9   rX   rY   rZ   r:   primary	secondarybs	            r   r;   zDoubleMetaphoneFilter.__call__   s     **..,, 	A!1!&&!9GYA m+"o-	s   A-B0BN)g      ?g      ?F)r<   r=   r>   r?   rC   r   r5   r;   rD   r   r   rV   rV      s     H?
r   rV   N)whoosh.analysis.filtersr   whoosh.compatr   whoosh.lang.dmetaphoner   whoosh.lang.porterr   whoosh.util.cacher   r   r
   rF   rV   rD   r   r   <module>rf      s@   8 + ' 3 # 6r rj@,j @,F0F 0r   