
    =wg1                         d dl mZmZ d dlmZmZ d dlmZ  ed      Z G d de      Z	 G d de	      Z
 G d	 d
e	      Z G d de	      Zd Zd Z G d de	      Zy)    )u	text_type)
ComposableToken)rcompilez\w+(\.?\w+)*c                       e Zd ZdZd Zy)	TokenizerzBase class for Tokenizers.
    c                 :    |xr | j                   |j                   u S N)	__class__selfothers     Q/var/www/horilla/myenv/lib/python3.12/site-packages/whoosh/analysis/tokenizers.py__eq__zTokenizer.__eq__+   s    :5??::    N)__name__
__module____qualname____doc__r    r   r   r	   r	   '   s    ;r   r	   c                       e Zd ZdZ	 	 	 ddZy)IDTokenizerzYields the entire input string as a single token. For use in indexed but
    untokenized fields, such as a document's path.

    >>> idt = IDTokenizer()
    >>> [token.text for token in idt("/a/b 123 alpha")]
    ["/a/b 123 alpha"]
    c	              +      K   t        |t              s
J d|z         t        ||f||d|	}
||
_        d|
_        |r||
_        |r
|dz   |
_        |r||
_        |t        |      z   |
_	        |
 y w)N%r is not unicoderemovestopsmode      ?   )

isinstancer   r   textboostoriginalpos	startcharlenendchar)r   value	positionscharskeeporiginalr   	start_pos
start_charr   kwargsts              r   __call__zIDTokenizer.__call__8   s      %+H-@5-HH+)U $ AJMAE$AK"SZ/AIs   A/A1N)FFFTr   r    )r   r   r   r   r1   r   r   r   r   r   /   s     6;1513r   r   c                   0    e Zd ZdZedfdZd Z	 	 	 ddZy)RegexTokenizerz
    Uses a regular expression to extract tokens from text.

    >>> rex = RegexTokenizer()
    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
    ["hi", "there", "3.141", "big", "time", "under_score"]
    Fc                 2    t        |      | _        || _        y)a  
        :param expression: A regular expression object or string. Each match
            of the expression equals a token. Group 0 (the entire matched text)
            is used as the text of the token. If you require more complicated
            handling of the expression match, simply write your own tokenizer.
        :param gaps: If True, the tokenizer *splits* on the expression, rather
            than matching on the expression.
        N)r   
expressiongaps)r   r6   r7   s      r   __init__zRegexTokenizer.__init__S   s     #:.	r   c                     | j                   |j                   u r.| j                  j                  |j                  j                  k(  ryy)NTF)r   r6   patternr   s     r   r   zRegexTokenizer.__eq__`   s6    >>U__,&&%*:*:*B*BBr   c
              +   R  K   t        |t              sJ dt        |      z         t        ||f||	d|
}|s?|x|_        |_        d|_        |r||_        |r||_        |t        |      z   |_
        | y| j                  st        | j                  j                  |            D ]~  \  }}|j                  d      |_        d|_        |r|j
                  |_        d|_        |r
||z   |_        |r0||j#                         z   |_        ||j%                         z   |_
        |  yd}|}| j                  j                  |      D ]{  }|}|j#                         }||| }|rP||_        d|_        |r|j
                  |_        d|_        |r||_        |dz  }|r||z   |_        ||z   |_
        | |j%                         }} |t        |      k  rR||d |_        d|_        |r|j
                  |_        d|_        |r||_        |r||_        t        |      |_
        | yyw)  
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        z%s is not unicoder   r   r   Fr    N)r!   r   reprr   r$   r"   r#   r%   r&   r'   r(   r7   	enumerater6   finditergroupstoppedstartend)r   r)   r*   r+   r,   r   r-   r.   tokenizer   r/   r0   r%   matchprevendrB   rC   r"   s                     r   r1   zRegexTokenizer.__call__f   s$      %+N-@4;-NN+)U $ "''AJAG!(&U3	G'(@(@(GH 
UQ!"AJ!	%OAE",u{{}"<AK *UYY[ 8AI GC11%8 &kkmU3'!AF!AG#%&VV
 %AI  #q&05&8$.$4	G))+'&. U#wx!"AJ!	AE")AK #E
AI $s   H%H'NFFFTr   r   Tr2   )r   r   r   r   default_patternr8   r   r1   r   r   r   r4   r4   J   s+     #2  JOGKRr   r4   c                   <    e Zd ZdZ ee      Zd Zd Z	 	 	 ddZ	y)CharsetTokenizera  Tokenizes and translates text according to a character mapping object.
    Characters that map to None are considered token break characters. For all
    other characters the map is used to translate the character. This is useful
    for case and accent folding.

    This tokenizer loops character-by-character and so will likely be much
    slower than :class:`RegexTokenizer`.

    One way to get a character mapping object is to convert a Sphinx charset
    table file using :func:`whoosh.support.charset.charset_table_to_dict`.

    >>> from whoosh.support.charset import charset_table_to_dict
    >>> from whoosh.support.charset import default_charset
    >>> charmap = charset_table_to_dict(default_charset)
    >>> chtokenizer = CharsetTokenizer(charmap)
    >>> [t.text for t in chtokenizer(u'Stra\xdfe ABC')]
    [u'strase', u'abc']

    The Sphinx charset table format is described at
    http://www.sphinxsearch.com/docs/current.html#conf-charset-table.
    charmapc                     || _         y)z
        :param charmap: a mapping from integer character numbers to unicode
            characters, as used by the unicode.translate() method.
        NrK   )r   rL   s     r   r8   zCharsetTokenizer.__init__   s    
 r   c                 p    |xr3 | j                   |j                   u xr | j                  |j                  k(  S r   )r   rL   r   s     r   r   zCharsetTokenizer.__eq__   s4     2NNeoo52LLEMM1	3r   c
              +     K   t        |t              s
J d|z         t        ||f||	d|
}|s?|x|_        |_        d|_        |r||_        |r||_        |t        |      z   |_	        | yt        d      }| j                  }|}|x}}|D ]u  }|t        |         }|r||z  }nX||kD  rC||_        d|_        |r|j                  |_        |r||_        |dz  }|r||_        ||_	        | |dz   }t        d      }|dz  }w ||kD  rB||| |_        d|_        |r|j                  |_        |r||_        |r||_        ||_	        | yyw)r<   r   r   r   r2   r    N)r!   r   r   r$   r"   r#   r%   r&   r'   r(   r   rL   ord)r   r)   r*   r+   r,   r   r-   r.   rD   r   r/   r0   r"   rL   r%   r&   currentcharchartchars                      r   r1   zCharsetTokenizer.__call__   s      %+H-@5-HH+)U $ "''AJAG!(&U3	GR5DllGC&00I !D	*EMD"Y.!%"%')*AJ$$'AE1HC *3AK(3AI +aIR5Dq )!, Y&y5!"AJAE"+AK +AI 's   EENrG   )
r   r   r   r   dictstr__inittype__r8   r   r1   r   r   r   rJ   rJ      s0    , $L3
 JOGKBr   rJ   c                      t        d      S )zReturns a RegexTokenizer that splits tokens by whitespace.

    >>> sst = SpaceSeparatedTokenizer()
    >>> [token.text for token in sst("hi there big-time, what's up")]
    ["hi", "there", "big-time,", "what's", "up"]
    z[^ \t\r\n]+)r4   r   r   r   SpaceSeparatedTokenizerrX   %  s     .))r   c                  4    ddl m}  t        d       |        z  S )a  Splits tokens by commas.

    Note that the tokenizer calls unicode.strip() on each match of the regular
    expression.

    >>> cst = CommaSeparatedTokenizer()
    >>> [token.text for token in cst("hi there, what's , up")]
    ["hi there", "what's", "up"]
    r   StripFilterz[^,]+)whoosh.analysis.filtersr[   r4   rZ   s    r   CommaSeparatedTokenizerr]   0  s     4(#km33r   c                        e Zd ZdZddZddZy)PathTokenizerzhA simple tokenizer that given a string ``"/a/b/c"`` yields tokens
    ``["/a", "/a/b", "/a/b/c"]``.
    c                 $    t        |      | _        y r   )r   expr)r   r6   s     r   r8   zPathTokenizer.__init__E  s    Z(	r   c              +      K   t        |t              s
J d|z         t        |fi |}|}| j                  j	                  |      D ],  }|d |j                          |_        |r||_        |dz  }| . y w)Nr   r    )r!   r   r   ra   r?   rC   r"   r%   )r   r)   r*   r-   r/   tokenr%   rE   s           r   r1   zPathTokenizer.__call__H  s     5),	I.AE.I	I,y+F+ii((/ 
U-UZ [
s   A4A6N)z[^/]+)Fr   )r   r   r   r   r8   r1   r   r   r   r_   r_   @  s    )	r   r_   N)whoosh.compatr   r   whoosh.analysis.acorer   r   whoosh.util.textr   rH   r	   r   r4   rJ   rX   r]   r_   r   r   r   <module>rg      sh   8 ' 3 % ?+;
 ;) 6nY nbgy gT*4 I r   