
import re
from collections import deque

from whoosh.compat import u, text_type
from whoosh.compat import xrange
from whoosh.analysis.filters import Filter


class CompoundWordFilter(Filter):
    """Given a set of words (or any object with a ``__contains__`` method),
    break any tokens in the stream that are composites of words in the word set
    into their individual parts.

    Given the correct set of words, this filter can break apart run-together
    words and trademarks (e.g. "turbosquid", "applescript"). It can also be
    useful for agglutinative languages such as German.

    The ``keep_compound`` argument lets you decide whether to keep the
    compound word in the token stream along with the word segments.

    >>> cwf = CompoundWordFilter(wordset, keep_compound=True)
    >>> analyzer = RegexTokenizer(r"\S+") | cwf
    >>> [t.text for t in analyzer("I do not like greeneggs and ham")]
    ["I", "do", "not", "like", "greeneggs", "green", "eggs", "and", "ham"]
    >>> cwf.keep_compound = False
    >>> [t.text for t in analyzer("I do not like greeneggs and ham")]
    ["I", "do", "not", "like", "green", "eggs", "and", "ham"]
    """

    def __init__(self, wordset, keep_compound=True):
        """
        :param wordset: an object with a ``__contains__`` method, such as a
            set, containing strings to look for inside the tokens.
        :param keep_compound: if True (the default), the original compound
            token will be retained in the stream before the subwords.
        N)wordsetkeep_compound)selfr   r   s      P/var/www/horilla/myenv/lib/python3.12/site-packages/whoosh/analysis/intraword.py__init__zCompoundWordFilter.__init__9   s     *    c                     || j                   v r|gS ||v r||   S t        dt        |            D ]>  }|d | }|| j                   v s||d  }| j                  ||      }|s1|g|z   }|||<   |c S  y N   )r   r   lensubwords)r   smemoiprefixsuffixsuffix_subsresults           r   r   zCompoundWordFilter.subwordsD   s    3J97N3q6" 	"ArUF%12"mmFD9$X3F$DG!M	" r   c              #      K   | j                   }i }| j                  }|D ]B  } ||j                  |      }|r't        |      dkD  r|r| |D ]  }||_        |  ?| D y wr   )r   r   textr   )r   tokensr   r   r   tsubssubwords           r   __call__zCompoundWordFilter.__call__V   su     **== 		AAFFD)Dt9q=]G# G$AFG 		s   A#A%N)T)__name__
__module____qualname____doc__r   r   r#    r   r   r	   r	   $   s    (	+$r   r	   c                       e Zd ZdZddZd Zy)BiWordFiltera  Merges adjacent tokens into "bi-word" tokens, so that for example::

        "the", "sign", "of", "four"

    becomes::

        "the-sign", "sign-of", "of-four"

    This can be used to create fields for pseudo-phrase searching, where if
    all the terms match the document probably contains the phrase, but the
    searching is faster than actually doing a phrase search on individual word
    terms.

    The ``BiWordFilter`` is much faster than using the otherwise equivalent
    ``ShingleFilter(2)``.
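
    For example, a rough usage sketch (assuming a whitespace tokenizer such as
    ``RegexTokenizer(r"\S+")``, as used elsewhere in this module; the hyphen
    separator is the default):

    >>> analyzer = RegexTokenizer(r"\S+") | BiWordFilter()
    >>> [t.text for t in analyzer("the sign of four")]
    ["the-sign", "sign-of", "of-four"]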
    """

    def __init__(self, sep="-"):
        self.sep = sep

    def __call__(self, tokens):
        sep = self.sep
        prev_text = None
        prev_startchar = None
        prev_pos = None
        atleastone = False

        for token in tokens:
            # Save the original text of this token
            text = token.text

            # Save the original position, if any
            positions = token.positions
            if positions:
                ps = token.pos

            # Save the original start char, if any
            chars = token.chars
            if chars:
                sc = token.startchar

            if prev_text is not None:
                # The bi-word takes its position and start char from the
                # previous token
                if positions:
                    token.pos = prev_pos
                if chars:
                    token.startchar = prev_startchar

                # Join the previous and current texts to form the bi-word
                token.text = "".join((prev_text, sep, text))
                yield token
                atleastone = True

            # Remember this token's attributes for the next bi-word
            prev_text = text
            if chars:
                prev_startchar = sc
            if positions:
                prev_pos = ps

        # If no bi-words were emitted (the stream had only a single token),
        # emit that single token unchanged
        if not atleastone:
            yield token


class ShingleFilter(Filter):
    """Merges a certain number of adjacent tokens into multi-word tokens, so
    that for example::

        "better", "a", "witty", "fool", "than", "a", "foolish", "wit"

    with ``ShingleFilter(3, ' ')`` becomes::

        'better a witty', 'a witty fool', 'witty fool than', 'fool than a',
        'than a foolish', 'a foolish wit'

    This can be used to create fields for pseudo-phrase searching, where if
    all the terms match the document probably contains the phrase, but the
    searching is faster than actually doing a phrase search on individual word
    terms.

    If you're using two-word shingles, you should use the functionally
    equivalent ``BiWordFilter`` instead because it's faster than
    ``ShingleFilter``.
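
    For example, a rough sketch of the three-word, space-separated
    configuration mentioned above (assuming a whitespace tokenizer):

    >>> analyzer = RegexTokenizer(r"\S+") | ShingleFilter(3, ' ')
    >>> [t.text for t in analyzer("better a witty fool than a foolish wit")]
    ["better a witty", "a witty fool", "witty fool than", "fool than a",
     "than a foolish", "a foolish wit"]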
    """

    def __init__(self, size=2, sep="-"):
        self.size = size
        self.sep = sep

    def __call__(self, tokens):
        size = self.size
        sep = self.sep
        buf = deque()
        atleastone = False

        def make_token():
            # Join the texts of the buffered tokens into a single shingle
            # token, reusing the first buffered token's attributes
            tk = buf[0]
            tk.text = sep.join([t.text for t in buf])
            if tk.chars:
                tk.endchar = buf[-1].endchar
            return tk

        for token in tokens:
            if not token.stopped:
                buf.append(token.copy())
                if len(buf) == size:
                    atleastone = True
                    yield make_token()
                    buf.popleft()

        # If no shingles were emitted (the stream had fewer than ``size``
        # tokens), merge whatever tokens there were into a single token
        if not atleastone and buf:
            yield make_token()


class IntraWordFilter(Filter):
    """Splits words into subwords and performs optional transformations on
    subword groups. This filter is functionally based on yonik's
    WordDelimiterFilter in Solr, but shares no code with it.

    * Split on intra-word delimiters, e.g. `Wi-Fi` -> `Wi`, `Fi`.
    * When splitwords=True, split on case transitions,
      e.g. `PowerShot` -> `Power`, `Shot`.
    * When splitnums=True, split on letter-number transitions,
      e.g. `SD500` -> `SD`, `500`.
    * Leading and trailing delimiter characters are ignored.
    * Trailing possessive "'s" removed from subwords,
      e.g. `O'Neil's` -> `O`, `Neil`.

    The mergewords and mergenums arguments turn on merging of subwords.

    When the merge arguments are false, subwords are not merged.

    * `PowerShot` -> `0`:`Power`, `1`:`Shot` (where `0` and `1` are token
      positions).

    When one or both of the merge arguments are true, consecutive runs of
    alphabetic and/or numeric subwords are merged into an additional token with
    the same position as the last sub-word.

    * `PowerShot` -> `0`:`Power`, `1`:`Shot`, `1`:`PowerShot`
    * `A's+B's&C's` -> `0`:`A`, `1`:`B`, `2`:`C`, `2`:`ABC`
    * `Super-Duper-XL500-42-AutoCoder!` -> `0`:`Super`, `1`:`Duper`, `2`:`XL`,
      `2`:`SuperDuperXL`,
      `3`:`500`, `4`:`42`, `4`:`50042`, `5`:`Auto`, `6`:`Coder`,
      `6`:`AutoCoder`

    When using this filter you should use a tokenizer that only splits on
    whitespace, so the tokenizer does not remove intra-word delimiters before
    this filter can see them, and put this filter before any use of
    LowercaseFilter.

    >>> rt = RegexTokenizer(r"\S+")
    >>> iwf = IntraWordFilter()
    >>> lcf = LowercaseFilter()
    >>> analyzer = rt | iwf | lcf

    One use for this filter is to help match different written representations
    of a concept. For example, if the source text contained `wi-fi`, you
    probably want `wifi`, `WiFi`, `wi-fi`, etc. to match. One way of doing this
    is to specify mergewords=True and/or mergenums=True in the analyzer used
    for indexing, and mergewords=False / mergenums=False in the analyzer used
    for querying.

    >>> iwf_i = IntraWordFilter(mergewords=True, mergenums=True)
    >>> iwf_q = IntraWordFilter(mergewords=False, mergenums=False)
    >>> iwf = MultiFilter(index=iwf_i, query=iwf_q)
    >>> analyzer = RegexTokenizer(r"\S+") | iwf | LowercaseFilter()

    (See :class:`MultiFilter`.)
    T)delims
splitwords	splitnums
mergewords	mergenumsz -_'"()!@#$%^&*[]{}<>\|;:,./?`~=+Fc                 H   ddl m}m}m} t	        j
                  |      | _        t	        j                  t        d      | j                  fz  t        j                        | _
        t        d      ||| j                  fz  }	t	        j                  |	t        j                        | _        t        d      ||fz  }
t        d      |||fz  }t        d      |||fz  }|r=|r;t        d      |
||fz  }t	        j                  |t        j                        | _        np|r3t	        j                  t        |
      t        j                        | _        n;|r9t        d	      ||fz  }t	        j                  |t        j                        | _        |xs || _        || _        || _        y
)aP  
        :param delims: a string of delimiter characters.
        :param splitwords: if True, split at case transitions,
            e.g. `PowerShot` -> `Power`, `Shot`
        :param splitnums: if True, split at letter-number transitions,
            e.g. `SD500` -> `SD`, `500`
        :param mergewords: merge consecutive runs of alphabetic subwords into
            an additional token with the same position as the last subword.
        :param mergenums: merge consecutive runs of numeric subwords into an
            additional token with the same position as the last subword.
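
        For example, a construction sketch (the argument values here are
        purely illustrative):

        >>> iwf = IntraWordFilter(delims="-_", mergewords=True)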
        """

        from whoosh.support.unicode import digits, lowercase, uppercase

        self.delims = re.escape(delims)

        # Expression for text between delimiter characters
        self.between = re.compile(u("[^%s]+") % (self.delims,), re.UNICODE)

        # Expression for removing "'s" from the end of sub-words, requiring a
        # delimiter or the end of the string after the "'s"
        dispat = u("(?<=[%s%s])'[Ss](?=$|[%s])") % (lowercase, uppercase,
                                                    self.delims)
        self.possessive = re.compile(dispat, re.UNICODE)

        # Expressions for finding case and letter-number transitions
        lower2upper = u("[%s][%s]") % (lowercase, uppercase)
        letter2digit = u("[%s%s][%s]") % (lowercase, uppercase, digits)
        digit2letter = u("[%s][%s%s]") % (digits, lowercase, uppercase)
        if splitwords and splitnums:
            splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit,
                                          digit2letter)
            self.boundary = re.compile(splitpat, re.UNICODE)
        elif splitwords:
            self.boundary = re.compile(text_type(lower2upper), re.UNICODE)
        elif splitnums:
            numpat = u("(%s|%s)") % (letter2digit, digit2letter)
            self.boundary = re.compile(numpat, re.UNICODE)

        self.splitting = splitwords or splitnums
        self.mergewords = mergewords
        self.mergenums = mergenums

    def __eq__(self, other):
        return (other
                and self.__class__ is other.__class__
                and self.__dict__ == other.__dict__)

    def _split(self, string):
        # Yields (startchar, endchar) pairs for each indexable substring of
        # the given string, according to the filter's settings
        bound = self.boundary

        # Make a list of (startchar, endchar) pairs of the runs of text
        # between possessive "'s" markers
        if "'" in string:
            # Split on possessive "'s"
            dispos = []
            prev = 0
            for match in self.possessive.finditer(string):
                dispos.append((prev, match.start()))
                prev = match.end()
            if prev < len(string):
                dispos.append((prev, len(string)))
        else:
            # Shortcut if there's no apostrophe in the string
            dispos = ((0, len(string)),)

        # For each run between possessives...
        for sc, ec in dispos:
            # ...find the runs of indexable (non-delimiter) characters
            for part_match in self.between.finditer(string, sc, ec):
                part_start = part_match.start()
                part_end = part_match.end()

                if self.splitting:
                    # Split the part at each case or letter-number transition
                    prev = part_start
                    for bmatch in bound.finditer(string, part_start, part_end):
                        # The transition pattern matches two characters; the
                        # split point is between them
                        pivot = bmatch.start() + 1
                        yield (prev, pivot)
                        prev = pivot
                    # Yield whatever is left after the last transition
                    if prev < part_end:
                        yield (prev, part_end)
                else:
                    # Not splitting on transitions, just yield the whole part
                    yield (part_start, part_end)

    def _merge(self, parts):
        # Merges consecutive runs of all-alphabetic and/or all-numeric
        # subwords in the given list of (text, pos, startchar, endchar)
        # tuples, inserting each merged subword after the last subword of its
        # run, with the same position as that last subword

        mergewords = self.mergewords
        mergenums = self.mergenums

        # Type of the current run: 1 = alpha, 2 = digit, None = other
        last = 0
        # Index in the parts list at which to insert the next merged subword
        insertat = 0
        # Buffer of same-type subwords to merge
        buf = []

        def insert_item(buf, at, newpos):
            # Build a merged (text, pos, startchar, endchar) item from the
            # buffered subwords and insert it into the parts list
            newtext = "".join(item[0] for item in buf)
            newsc = buf[0][2]  # Start char of the first subword in the run
            newec = buf[-1][3]  # End char of the last subword in the run
            parts.insert(at, (newtext, newpos, newsc, newec))

        # Iterate over a copy of the parts list so the inserts above don't
        # affect the iteration
        for item in list(parts):
            text = item[0]

            # Decide the type of this part
            if text.isalpha():
                this = 1
            elif text.isdigit():
                this = 2
            else:
                this = None

            if ((this == last == 1 and mergewords)
                    or (this == last == 2 and mergenums)):
                # Same mergeable type as the previous part: add it to the
                # buffer of subwords to merge
                buf.append(item)
            else:
                # Different type: flush the previous run (if it had more than
                # one subword) and start a new buffer with this part
                if len(buf) > 1:
                    insert_item(buf, insertat, buf[-1][1])
                    insertat += 1
                buf = [item]
                last = this

            insertat += 1

        # If a mergeable run is left over at the end, merge it and append it
        # to the parts list
        if len(buf) > 1:
            insert_item(buf, len(parts), buf[-1][1])

    def __call__(self, tokens):
        mergewords = self.mergewords
        mergenums = self.mergenums

        # This filter renumbers tokens as it expands them: new position
        # counter
        newpos = None
        for t in tokens:
            text = t.text

            # If this is the first token we've seen, start the new position
            # counter from its position
            if newpos is None:
                if t.positions:
                    newpos = t.pos
                else:
                    # The token doesn't have positions, just use 0
                    newpos = 0

            if ((text.isalpha() and (text.islower() or text.isupper()))
                    or text.isdigit()):
                # Short-circuit the common cases of all-lowercase,
                # all-uppercase, or all-digit tokens with nothing to split
                t.pos = newpos
                yield t
                newpos += 1
            else:
                # Split the token text into (text, pos, startchar, endchar)
                # tuples for each subword
                ranges = self._split(text)
                parts = [(text[sc:ec], i + newpos, sc, ec)
                         for i, (sc, ec) in enumerate(ranges)]

                # If the options are set, merge consecutive runs of
                # all-alphabetic and/or all-numeric subwords
                if len(parts) > 1 and (mergewords or mergenums):
                    self._merge(parts)

                # The subword start/end chars are offsets into the token
                # text, so add the token's start char to get absolute chars
                chars = t.chars
                if chars:
                    base = t.startchar

                # Yield a token for each part
                for text, pos, startchar, endchar in parts:
                    t.text = text
                    t.pos = pos
                    if chars:
                        t.startchar = base + startchar
                        t.endchar = base + endchar
                    yield t

                if parts:
                    # Continue the position counter just after the last
                    # subword position
                    newpos = parts[-1][1] + 1