
    =wg,              	       >   d dl mZmZ d dlmZ d dlmZ d dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ  G d de      Z G d de      ZddZd dZd!dZedfdZeedddfdZeedddeddfdZdedddddddf	dZeddfdZy)"    )
ComposableCompositionError)	Tokenizer)LowercaseFilter)
StopFilter
STOP_WORDS)
StemFilter)IntraWordFilter)default_pattern)CommaSeparatedTokenizer)IDTokenizerRegexTokenizer)SpaceSeparatedTokenizer)stemc                   (    e Zd ZdZd Zd Zd Zd Zy)Analyzerz( Abstract base class for analyzers.
    c                 4    d| j                   j                  z  S )Nz%s())	__class____name__selfs    P/var/www/horilla/myenv/lib/python3.12/site-packages/whoosh/analysis/analyzers.py__repr__zAnalyzer.__repr__0   s    ////    c                 p    |xr3 | j                   |j                   u xr | j                  |j                  k(  S N)r   __dict__r   others     r   __eq__zAnalyzer.__eq__3   s4     4NNeoo54MMU^^3	5r   c                     t         r   )NotImplementedError)r   valuekwargss      r   __call__zAnalyzer.__call__8   s    !!r   c                      y r    r   s    r   cleanzAnalyzer.clean;   s    r   N)r   
__module____qualname____doc__r   r!   r&   r)   r(   r   r   r   r   ,   s    05
"r   r   c                   >    e Zd Zd Zd Zd
dZd Zd Zd Zd Z	d Z
y	)CompositeAnalyzerc                 :   g | _         |D ]S  }t        |t              r&| j                   j                  |j                          9| j                   j	                  |       U | j                   dd  D ]*  }t        |t
              st        d| j                   z         y )N   z;Only one tokenizer allowed at the start of the analyzer: %r)items
isinstancer.   extendappendr   r   )r   composablescompitems       r   __init__zCompositeAnalyzer.__init__@   s    
 	(D$ 12

!!$**-

!!$'		( JJqrN 	LD$	*& (>@D

(K L L	Lr   c                 z    | j                   j                  ddj                  d | j                  D              dS )N(z, c              3   2   K   | ]  }t        |        y wr   )repr.0r7   s     r   	<genexpr>z-CompositeAnalyzer.__repr__.<locals>.<genexpr>S   s     $GDT$Z$Gs   ))r   r   joinr1   r   s    r   r   zCompositeAnalyzer.__repr__Q   s1    >>2299$GDJJ$GGI 	Ir   c                     | j                   } |d   |fi |}|dd  D ]%  }|rt        |d      r|j                  r ||      }' |S )Nr   r0   is_morph)r1   hasattrrC   )r   r$   no_morphr%   r1   genr7   s          r   r&   zCompositeAnalyzer.__call__U   sW    

eAhu''!"I 	 Dz!:t}}3i	  
r   c                 8    | j                   j                  |      S r   )r1   __getitem__r   r7   s     r   rH   zCompositeAnalyzer.__getitem___   s    zz%%d++r   c                 ,    t        | j                        S r   )lenr1   r   s    r   __len__zCompositeAnalyzer.__len__b   s    4::r   c                 p    |xr3 | j                   |j                   u xr | j                  |j                  k(  S r   )r   r1   r   s     r   r!   zCompositeAnalyzer.__eq__e   s4     .NNeoo5.JJ%++-	/r   c                 `    | j                   D ]  }t        |d      s|j                          ! y )Nr)   )r1   rD   r)   rI   s     r   r)   zCompositeAnalyzer.cleanj   s(    JJ 	DtW%

	r   c                 :    t        d | j                  D              S )Nc              3   4   K   | ]  }|j                     y wr   )rC   r=   s     r   r?   z.CompositeAnalyzer.has_morph.<locals>.<genexpr>p   s     8T4==8s   )anyr1   r   s    r   	has_morphzCompositeAnalyzer.has_morpho   s    8TZZ888r   NF)r   r*   r+   r8   r   r&   rH   rL   r!   r)   rR   r(   r   r   r.   r.   ?   s,    L"I,/

9r   r.   Fc                 8    t               }| r|t               z  }|S )zYDeprecated, just use an IDTokenizer directly, with a LowercaseFilter if
    desired.
    )r   r   )	lowercase	tokenizers     r   
IDAnalyzerrW   u   s!    
 I 11	r   c                 R    |rt               }n
t               }| r|t               z  }|S )a]  Parses whitespace- or comma-separated tokens.

    >>> ana = KeywordAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["Hello", "there,", "this", "is", "a", "TEST"]

    :param lowercase: whether to lowercase the tokens.
    :param commas: if True, items are separated by commas rather than
        whitespace.
    )r   r   r   )rU   commasrV   s      r   KeywordAnalyzerrZ      s-     +-	+-	 11	r   c                     t        | |      S )z4Deprecated, just use a RegexTokenizer directly.
    
expressiongapsr   r\   s     r   RegexAnalyzerr_      s     Zd;;r   c                 2    t        | |      t               z  S )a  Composes a RegexTokenizer with a LowercaseFilter.

    >>> ana = SimpleAnalyzer()
    >>> [token.text for token in ana("Hello there, this is a TEST")]
    ["hello", "there", "this", "is", "a", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    r\   )r   r   r\   s     r   SimpleAnalyzerra      s     Zd;o>OOOr      Nc                 `    t        | |      }|t               z  }||t        |||      z  }|S )a  Composes a RegexTokenizer with a LowercaseFilter and optional
    StopFilter.

    >>> ana = StandardAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["testing", "testing", "testing"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer that this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    r\   stoplistminsizemaxsize)r   r   r   )r]   re   rf   rg   r^   retchains          r   StandardAnalyzerrj      s@    $ JT
:C/##E
Hg+24 4Lr   iP  c                 ~    t        | |      }|t               z  }	||	t        |||      z  }	|	t        |||      z  S )a  Composes a RegexTokenizer with a lower case filter, an optional stop
    filter, and a stemming filter.

    >>> ana = StemmingAnalyzer()
    >>> [token.text for token in ana("Testing is testing and testing")]
    ["test", "test", "test"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer that this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param ignore: a set of words to not stem.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use. Use None for no cache, or -1 for an unbounded cache.
    r\   rd   )stemfnignore	cachesize)r   r   r   r	   )
r]   re   rf   rg   r^   rl   rm   rn   rh   ri   s
             r   StemmingAnalyzerro      sV    . JT
:C/##E
Hg+24 4:VF(13 3 3r   z\s+Tc	                 n    t        | |      t        ||||      z  t               z  t        ||      z  S )a  Composes a RegexTokenizer with an IntraWordFilter, LowercaseFilter, and
    StopFilter.

    >>> ana = FancyAnalyzer()
    >>> [token.text for token in ana("Should I call getInt or get_real?")]
    ["should", "call", "getInt", "get", "int", "get_real", "get", "real"]

    :param expression: The regular expression pattern to use to extract tokens.
    :param stoplist: A list of stop words. Set this to None to disable
        the stop word filter.
    :param minsize: Words smaller than this are removed from the stream.
    :param maxsize: Words longer that this are removed from the stream.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    r\   )
splitwords	splitnums
mergewords	mergenums)re   rf   )r   r
   r   r   )	r]   re   rf   rg   r^   rq   rr   rs   rt   s	            r   FancyAnalyzerru      sI    & jt<y)3yJJ   (G<	= r   c                     ddl m}m} t        ||      t	               z  }	 |t        |       z  }	 |t        | |      z  }|S # |$ r Y w xY w# |$ r Y |S w xY w)aa  Configures a simple analyzer for the given language, with a
    LowercaseFilter, StopFilter, and StemFilter.

    >>> ana = LanguageAnalyzer("es")
    >>> [token.text for token in ana("Por el mar corren las liebres")]
    ['mar', 'corr', 'liebr']

    The list of available languages is in `whoosh.lang.languages`.
    You can use :func:`whoosh.lang.has_stemmer` and
    :func:`whoosh.lang.has_stopwords` to check if a given language has a
    stemming function and/or stop word list available.

    :param expression: The regular expression pattern to use to extract tokens.
    :param gaps: If True, the tokenizer *splits* on the expression, rather
        than matching on the expression.
    :param cachesize: the maximum number of stemmed words to cache. The larger
        this number, the faster stemming will be but the more memory it will
        use.
    r   )	NoStemmerNoStopWordsr\   )lang)ry   rn   )whoosh.langrw   rx   r   r   r   r	   )ry   r]   r^   rn   rw   rx   ri   s          r   LanguageAnalyzerr{      s}    , 3 z= !E
--

	BB L    Ls"   A A AAAArS   )FF)z\w+(\.?\w+)*F)whoosh.analysis.acorer   r   whoosh.analysis.tokenizersr   whoosh.analysis.filtersr   r   r   whoosh.analysis.morphr	   whoosh.analysis.intrawordr
   r   r   r   r   r   whoosh.lang.porterr   r   r.   rW   rZ   r_   ra   rj   ro   ru   r{   r(   r   r   <module>r      s   8 ? 0 3 : , 5 6 > 2 5 > #
z &19 19l*< .E P !0*54 !0*5 E3@ $j!Tdd"e6 '6E$(r   