
    =wg-                         d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
  G d de      Z G d	 d
e      Z G d de      Z G d de      Z G d de      ZddZddZd ZddZddZd Zd Zd Zy)zQClasses and functions for classifying and extracting information from
documents.
    )divisionN)defaultdict)log)xrange	iteritemsc                       e Zd Zd Zd Zd Zy)ExpansionModelc                     || _         || _        | j                   r| j                  | j                   z  | _        y d| _        y )Nr   )Ncollection_totalmean_length)self	doc_countfield_lengths      F/var/www/horilla/myenv/lib/python3.12/site-packages/whoosh/classify.py__init__zExpansionModel.__init__+   s7     ,66#44tvv=D D    c                     t         NNotImplementedErrorr   	maxweight	top_totals      r   
normalizerzExpansionModel.normalizer4       !!r   c                     t         r   r   )r   weight_in_topweight_in_collectionr   s       r   scorezExpansionModel.score7   r   r   N)__name__
__module____qualname__r   r   r     r   r   r	   r	   *   s    !""r   r	   c                       e Zd Zd Zd Zy)Bo1Modelc                 ~    || j                   z  }|t        d|z   |z        z  t        d|z         z   t        d      z  S )N      ?       @r   r   r   r   r   fs       r   r   zBo1Model.normalizer<   s>    CqA..S1W=SIIr   c                 j    || j                   z  }|t        d|z   |z  d      z  t        d|z   d      z   S Nr(      r*   r   r   r   r   r,   s        r   r    zBo1Model.score@   s9     466)sC!Gq=!44s37AFFr   Nr!   r"   r#   r   r    r$   r   r   r&   r&   ;   s    JGr   r&   c                       e Zd Zd Zd Zy)Bo2Modelc                     || j                   z  | j                  z  }|t        d|z   |z  d      z  t        d|z   d      z   S r.   )r   r   r   r+   s       r   r   zBo2Model.normalizerF   sD    !6!663a1}a003sQw?BBr   c                 p    ||z  | j                   z  }|t        d|z   |z  d      z  t        d|z   d      z   S r.   r   r   r0   s        r   r    zBo2Model.scoreJ   s@    I%(=(==sC!Gq=!44s37AFFr   Nr1   r$   r   r   r3   r3   E   s    CGr   r3   c                       e Zd Zd Zd Zy)KLModelc                 V    |t        | j                  |z        z  t        d      z  |z  S )Nr)   )r   r   r   s      r   r   zKLModel.normalizerP   s0    C 5 5	 ABBSXM 	r   c                 t    ||z  }|| j                   z  }||k  ry|t        ||| j                   z  z  d      z  S )Nr   r/   r6   )r   r   r   r   wit_over_ttwic_over_cts         r   r    zKLModel.scoreT   sT    #i/*T-B-BB$[(58M8M(M&O%&"( ( (r   Nr1   r$   r   r   r8   r8   O   s    	(r   r8   c                   4    e Zd ZdZefdZd Zd Zd ZddZ	y)	ExpanderzeUses an ExpansionModel to expand the set of query terms based on the top
    N result documents.
    c                    || _         || _        | j                   j                         }| j                   j                  |      }t	        |      t        u r	 |||      }|| _        t        t              | _        d| _	        y)aT  
        :param reader: A :class:whoosh.reading.IndexReader object.
        :param fieldname: The name of the field in which to search.
        :param model: (classify.ExpansionModel) The model to use for expanding
            the query terms. If you omit this parameter, the expander uses
            :class:`Bo1Model` by default.
        r   N)
ixreader	fieldnamedoc_count_allr   typemodelr   floattopN_weightr   )r   r@   rA   rD   doccountfieldlens         r   r   zExpander.__init__e   so     !"MM//1==--i8;$(H-E
 'u- r   c                     d}| j                   }|D ]  \  }}||z  }||xx   |z  cc<    | xj                  |z  c_        y)zAdds forward-index information about one of the "top N" documents.

        :param vector: A series of (text, weight) tuples, such as is
            returned by Reader.vector_as("weight", docnum, fieldname).
        r   N)rF   r   )r   vectortotal_weightrF   wordweights         r   addzExpander.add}   sQ     &&" 	(LD&F"L'	( 	,&r   c                    | j                   }| j                   j                  || j                        r-| j                  |j	                  d|| j                               y | j                   j
                  | j                     j                  r:| j                  |j                  |      j                  | j                               y t        d| j                  d|d      )NrM   zField z in document z is not vectored or stored)r@   
has_vectorrA   rN   	vector_asschemastoredadd_textstored_fieldsget	Exception)r   docnumr@   s      r   add_documentzExpander.add_document   s    ====##FDNN;HHX''&$..IJ]]!!$..188MM(008<<T^^LM#~~v7 8 8r   c                     | j                   j                  | j                     }|j                  | j	                  fd|j                  |      D               y )Nc              3   <   K   | ]  \  }}}} |      |f  y wr   r$   ).0text_rM   
from_bytess       r   	<genexpr>z$Expander.add_text.<locals>.<genexpr>   s(      )0Ba*T"F+ )s   )r@   rR   rA   r_   rN   index)r   stringfieldr_   s      @r   rT   zExpander.add_text   sJ     $$T^^4%%
 )KK') 	)r   c                 n   | j                   }| j                  }| j                  }|j                  |   }g }d}| j                  sg S t        | j                        D ]f  \  }	}
|j                  |	      }||f|v s|j                  ||      }|j                  |
|| j                        }||kD  r|}|j                  ||	f       h |r|j                  || j                        }n|}|D 
cg c]  \  }
}|
|z  |f }}
}|j                  d        |d| D 
cg c]	  \  }
}||
f c}}
S c c}}
w c c}}
w )zReturns the N most important terms in the vectors added so far.

        :param number: The number of terms to return.
        :param normalize: Whether to normalize the weights.
        :returns: A list of ("term", weight) tuples.
        r   c                     d| d   z
  | d   fS Nr      r$   xs    r   <lambda>z)Expander.expanded_terms.<locals>.<lambda>   s    !ad(AaD!1 r   keyN)rD   rA   r@   rR   rF   r   to_bytes	frequencyr    r   appendr   sort)r   number	normalizerD   rA   r@   rc   tlistr   rL   rM   btextcfr    normts                   r   expanded_termszExpander.expanded_terms   s=    

NN	==	*	 I%d&6&67 	,LD&NN4(E5!X-''	59FB?9$ %IeT]+	, ##It~~>DD5:;	&4-#;;

1
2-27F^<	F<< < =s   *D+D1N)T)
r!   r"   r#   __doc__r&   r   rN   rY   rT   rx   r$   r   r   r>   r>   `   s%     3; 0' 8	)#=r   r>   c                      t        t              } fdt        t               dz
  z
        D        D ]  }||xx   dz  cc<    t	        |      S )Nc              3   .   K   | ]  }||z      y wr   r$   )r\   iinputsizes     r   r`   zshingles.<locals>.<genexpr>   s$      > !AH% >s   rg   )r   intr   lenr   )r}   r~   dshingles   ``  r   shinglesr      sQ    CA>#CJ$($;<> 	'
a
 Q<r   c                    dk(  rt         }nfd}dgz  }| D ]@  \  }} ||      }t              D ]%  }|d|z  z  r||xx   |z  cc<   ||xx   |z  cc<   ' B d}t        |      D ]  \  }}	|	dkD  s|d|z  z  } |S )N    c                     t        |       S r   )_hash)shashbitss    r   rj   zsimhash.<locals>.<lambda>   s    5H- r   r   rg   )hashr   	enumerate)
featuresr   hashfnvsfeaturerM   hr|   outvs
    `        r   simhashr      s    2~-
xB#  7O! 	 AAF|11		   C" 1q516MC Jr   c                     | dk(  ryt        | d         dz  }d}d|z  dz
  }| D ]  }||z  t        |      z  |z  } |t        |       z  }|dk(  rd}|S )	N r      iCB r/   rg   )ordr   )r   r   ri   mmaskcs         r   r   r      st    Bw!INH}q  	*Aa%3q6!T)A	*	SV7Ar   c                 J    | |z  d|z  dz
  z  }d}|r|dz  }||dz
  z  }|r|S )Nrg   r   r$   )
first_hash
other_hashr   ri   tots        r   hamming_distancer      sD    	j	 a8mq%89A
C
q	QU
  Jr   c                    d}t        |       }|}dg|z  }	dg|z  }
dg|z  }|r|}nt        j                  | |      }d}	 |}d}t        |      D ]  }d|	|<   d||<    t        |      D ]W  }|}t        |      D ]  }| |   ||   z
  dz  }||k  s||
|<   |} ||
|   xx   | |   z  cc<   |	|
|   xx   dz  cc<   ||z  }Y t        |      D ]  }|	|   r||   |	|   z  n||   ||<    |dz  }t	        ||z
        |k  s||kD  r	 |
|fS )ay  
    One-dimensional K-means clustering function.

    :param data: list of data points.
    :param k: number of clusters.
    :param t: tolerance; stop if changes between iterations are smaller than
        this value.
    :param distfun: a distance function.
    :param centers: a list of centroids to start with.
    :param maxiter: maximum number of iterations to run.
    g`r   r/   rg   )r   randomsampler   abs)datakrw   distfunmaxitercenters
DOUBLE_MAXnerrorcountslabelsc1r   niter	old_errorr|   r   min_distancedistances                      r   kmeansr     s     JD	AES1WFS1WF qB MM$"E
	  	AF1IBqE	  	"A%LAY , GadNq0l* !F1I#+L	, vayMT!W$M6!9"\!E	"  	=A(.q	2a56!9$r!uAaD	= 	
	!"Q&EGO19A r   c                 x    d}d}d}| D ]  }|dz  }||z   } ||z  }| D ]  }|||z
  ||z
  z  z  } ||dz
  z  }|S rf   r$   )r   r   sum1sum2ri   meanvariances          r   two_pass_variancer   K  sw    	ADD 	Qax !8D (Ta$h''( q1u~HOr   c                 ~    d}d}d}| D ])  \  }}||z   }||z
  }||z  |z  }|||z  |z  z  }||z  }|}+ ||dz
  z  }	|	S rf   r$   )
data_weight_pairsr   S	sumweightri   rM   tempQRVariances
             r   weighted_incremental_variancer   ]  s{    D	AI& 	6	!HJ	Y]Q		 IM"HOr   c                 D   g }t        |       D ]|  \  }}|}| |   }|t        |       dz
  k  r,||z
  |k  r$|dz  }| |   }|t        |       dz
  k  r	||z
  |k  r$d}||z
  dkD  rt        | ||dz          }|j                  ||||z
  |f       ~ |j	                  d        |S )Nrg   i c                     d| d   z
  | d   fS )Nr   r/      r$   rh   s    r   rj   zswin.<locals>.<lambda>x  s    QqT1Q4 0 r   rk   )r   r   r   ro   rp   )r   r~   clustersr|   leftjrightr   s           r   swinr   l  s    HT? 	14Q#d)a-EDL4$7FAGE #d)a-EDL4$7 q519!$qQ-0Aua!eQ/0	1 MM0M1Or   )r/   )r   )g-C6?N2   N)ry   
__future__r   r   collectionsr   mathr   whoosh.compatr   r   objectr	   r&   r3   r8   r>   r   r   r   r   r   r   r   r   r$   r   r   <module>r      s   8    #  +
"V ""G~ GG~ G(n ("e=v e=T, CP$r   