
    =wgT"                         d dl mZ d dl mZ d dlmZ d dlmZmZ d dlm	Z	m
Z
  G d de	      Z G d d	e      ZddZddZy
)    )	text_type)xrange)Token)FilterLowercaseFilter)	TokenizerRegexTokenizerc                   >    e Zd ZdZ eee      ZddZd Z	 	 ddZ	y)	NgramTokenizera7  Splits input text into N-grams instead of words.

    >>> ngt = NgramTokenizer(4)
    >>> [token.text for token in ngt("hi there")]
    ["hi t", "i th", " the", "ther", "here"]

    Note that this tokenizer does NOT use a regular expression to extract
    words, so the grams emitted by it will contain whitespace, punctuation,
    etc. You may want to massage the input or add a custom filter to this
    tokenizer's output.

    Alternatively, if you only want sub-word grams without whitespace, you
    could combine a RegexTokenizer with NgramFilter instead.
    """

    __inittypes__ = dict(minsize=int, maxsize=int)

    def __init__(self, minsize, maxsize=None):
        """
        :param minsize: The minimum size of the N-grams.
        :param maxsize: The maximum size of the N-grams. If you omit
            this parameter, maxsize == minsize.
        """

        self.min = minsize
        self.max = maxsize or minsize

    def __eq__(self, other):
        if self.__class__ is other.__class__:
            if self.min == other.min and self.max == other.max:
                return True
        return False

    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query":
            # In query mode, emit a single gram size: the longest size
            # that fits in the query value
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            # In indexing mode, emit grams of every size from minsize to
            # maxsize at each starting position
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t
                pos += 1
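

# Example sketch: how the two modes of NgramTokenizer.__call__ differ.
# The expected outputs follow the class docstring plus a straightforward
# reading of the code above, so treat them as illustrative rather than
# authoritative test vectors:
#
#     ngt = NgramTokenizer(minsize=2, maxsize=4)
#
#     # Indexing (default) mode: grams of every size from minsize to
#     # maxsize at each position: "hi", "hi ", "hi t", "i ", "i t", ...
#     grams = [t.text for t in ngt(u"hi there")]
#
#     # Query mode: a single gram size, min(maxsize, len(value)), so the
#     # same value expands to just "hi t", "i th", " the", "ther", "here".
#     grams = [t.text for t in ngt(u"hi there", mode="query")]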
# Filter

class NgramFilter(Filter):
    """Splits token text into N-grams.

    >>> rext = RegexTokenizer()
    >>> stream = rext("hello there")
    >>> ngf = NgramFilter(4)
    >>> [token.text for token in ngf(stream)]
    ["hell", "ello", "ther", "here"]
    r   Nc                 j    || _         |xs || _        d| _        |dk(  rd| _        y|dk(  rd| _        yy)a  
        :param minsize: The minimum size of the N-grams.
        :param maxsize: The maximum size of the N-grams. If you omit this
            parameter, maxsize == minsize.
        :param at: If 'start', only take N-grams from the start of each word.
            if 'end', only take N-grams from the end of each word. Otherwise,
            take all N-grams from the word (the default).
        r   r3   r4   r    N)r   r   at)r   r   r   rE   s       r   r   zNgramFilter.__init__   s?     %g=DG5[DG r   c                     |xrN | j                   |j                   u xr4 | j                  |j                  k(  xr | j                  |j                  k(  S r6   r   r   s     r   r   zNgramFilter.__eq__   sH     <5??: <HH		!<&*hh%))&;	<r   c              #   8  K   t        |d      sJ | j                  }|D ]v  }|j                  }t        |      | j                  k  r)|j
                  }|r|j                  }|j                  dk(  rt	        | j                  t        |j                              }|dk(  r|d | |_        |r
|z   |_	        | |dk(  r(|d|z
  d  |_        |r|j                  |z
  |_        | t        dt        |      |z
  dz         D ],  }||||z    |_        |r|z   |_        ||z   |z   |_	        | . |dk(  rYt	        | j                  t        |            }	t        | j                  |	dz         D ]  }|d | |_        |r
|z   |_	        |  r|dk(  rv|r|j                  }
t        dt        |      | j                  z
        }t        |t        |      | j                  z
  dz         D ]  }||d  |_        |r

|z   |_        |  t        dt        |      | j                  z
  dz         D ]b  }t        | j                  | j                  dz         D ]:  }||z   }|t        |      kD  r||| |_        |r|z   |_        ||z   |_	        | < d y y w)N__iter__r   rD   r    r   )hasattrrE   r#   r"   r   r+   r'   r   r   r(   r   )r   tokensrE   r1   r#   r+   r'   r2   r3   limitoriginal_startcharir4   s                r   r5   zNgramFilter.__call__   s    vz***WW >	$A66D4y488#GGEKK	
 vv 488S[18!%4[AF$-$4	G1W!!d()_AF&'ii$&6G!'3t9t+;a+?!@  !%eEDL!9 *3e*;AK(1E(9D(@AI  8#d)4E &txx ;  !%et (1D(8AI	  1W-.[[*3t9txx#78E#E3t9txx+?!+CD  !%ab *<q*@AK	  "(3t9txx+?!+C!D $$*488TXX\$B $D"'$,C"SY (%)%_AF$.7%.?,5O	"#G$$e>	$s   JJ)NNr8   r@   r   r   rB   rB   w   s%     c2M$<A$r   rB   Nc                 2    t        | |      t               z  S )zComposes an NgramTokenizer and a LowercaseFilter.

    >>> ana = NgramAnalyzer(4)
    >>> [token.text for token in ana("hi there")]
    ["hi t", "i th", " the", "ther", "here"]
    """

    return NgramTokenizer(minsize, maxsize=maxsize) | LowercaseFilter()


def NgramWordAnalyzer(minsize, maxsize=None, tokenizer=None, at=None):
    if not tokenizer:
        tokenizer = RegexTokenizer()
    return tokenizer | LowercaseFilter() | NgramFilter(minsize, maxsize, at=at)