a
    ^g1                     @   s   d Z ddlZddlZddlZddlmZ zddlZW n e	yN   e
d Y n0 dd Zd"d	d
Zdd ZG dd dZG dd deZd#ddZd$ddZG dd deZdd Zdd Zd%ddZd&d d!ZdS )'zO
This contrib module contains a few routines useful to do clustering variants.
    N)
ThreadPoolz2scipy not accessible, Python k-means will not workc                  O   s   d S N )argkwargsr   r   f/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/faiss/contrib/clustering.py	print_nop   s    r   T   c                 K   st  | j d }|dd}|rtnt}|d| j  d| d|  |d tj||f|dd	|}	|	|  |	jg}
|  |	j}|d
 t		 }|	
| \}}tj||d}|dt		 | ddt| dt|  | }~	|st|d | | }|dd |dd  }ndt|}|| |d  }|dd  |dd 8  < t||ks^J |dt| dt|  d}g }t		 }t|D ]}t|| }|dt		 | dd| d| d| d	ddd |||  }||| }t|| |ksJ tj||fi |}	| | }|	| |
|	j ||	j ~	|}q|dt		 | dd t||
fS )a=  
    perform 2-level clustering on a training set xt
    nc1 and nc2 are the number of clusters at each level, the final number of
    clusters is nc2. Additional arguments are passed to the Kmeans object.

    Rebalance allocates the number of sub-clusters depending on the number of
    first-level assignment.
       verboseFz2-level clustering of z nb 1st level clusters = z total zperform coarse trainingi  )niterZmax_points_per_centroidzassigning the training setZ	minlengthzdone in z.2fz s. Sizes of clusters -Nznb 2nd-level centroids r   [z s] training sub-cluster /z nc2= Tendflushz s)shapegetprintr   faissZKmeanstrainiteration_stats	centroidstimeassignnpbincountminmaxZargsortarangeZcumsumsumrangeintallappendZvstack)xtnc1Znc2Z	rebalanceZclustering_niterargsdr   logkmr   Z
centroids1t0_Zassign1bcoccZall_nc2Zbc_sumZi0c2c1i1ZsubsetZxtsubr   r   r   two_level_clustering   sd    	

,
2
r8   c                 K   s   t | } t| t jrht| j D ]$}| j|}|| |	|}q$t
| j|fi | d| _dS t| t jsxJ | jt jksJ tt| j}td| t||| jfi |\}}| j| | j| | | dS )zJ
    Applies 2-level clustering to an index_ivf embedded in an index.
    TNz
REBALANCE=)r   Zdowncast_index
isinstanceZIndexPreTransformr&   chainsizeatr   applytrain_ivf_index_with_2levelindexZ
is_trainedZIndexIVFZmetric_typeZ	METRIC_L2r'   r    sqrtZnlistr   r8   Z	quantizeradd)r?   r*   r,   ivtr+   r   r1   r   r   r   r>   _   s"    


r>   c                   @   sB   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdddZ	dS )DatasetAssignWrapper for a matrix that offers a function to assign the vectors
    to centroids. All other implementations offer the same interfacec                 C   s   t j|dd| _d S Nfloat32Zdtype)r    ascontiguousarrayxselfrJ   r   r   r   __init__   s    zDatasetAssign.__init__c                 C   s   | j jd S )Nr   rJ   r   rL   r   r   r   count   s    zDatasetAssign.countc                 C   s   | j jd S Nr
   rN   rO   r   r   r   dim   s    zDatasetAssign.dimc                 C   s
   | j | S r   )rJ   rL   indicesr   r   r   
get_subset   s    zDatasetAssign.get_subsetc                 C   s   t | j|dS rQ   )r   ZknnrJ   rL   r   r   r   r   perform_search   s    zDatasetAssign.perform_searchNc                 C   s   |  |\}}| }| }|j\}}tj||fdd}|d u rVtj||| j n$tj|||d d tjf | j  |||fS rF   )	rW   ravelr   r    ZzerosrA   r<   rJ   Znewaxis)rL   r   weightsDIncr-   sum_per_centroidr   r   r   	assign_to   s    
$zDatasetAssign.assign_to)N)
__name__
__module____qualname____doc__rM   rP   rR   rU   rW   r^   r   r   r   r   rD      s   rD   c                   @   s"   e Zd ZdZdddZdd ZdS )	DatasetAssignGPUz GPU version of the previous Fc                 C   sJ   t | | t|jd }|dkr:tt ||| _nt|| _d S )Nr
   r   )	rD   rM   r   ZIndexFlatL2r   Zindex_cpu_to_gpuZStandardGpuResourcesr?   Zindex_cpu_to_all_gpus)rL   rJ   Zgpu_idr   r?   r   r   r   rM      s    zDatasetAssignGPU.__init__c                 C   s&   | j   | j | | j | jdS rQ   )r?   resetrA   searchrJ   rV   r   r   r   rW      s    
zDatasetAssignGPU.perform_searchN)F)r_   r`   ra   rb   rM   rW   r   r   r   r   rc      s   
rc   c           	      C   s   | j d }|j d }|du r*|d d}|du rHt| dd}|d|  |j  }|jdd}| |t||   |  }||fS )z assignment function for xq is sparse, xb is dense
    uses a matrix multiplication. The squared norms can be provided if
    available.
    r   N   r
   )Zaxis)	r   r%   r    arraypowerTZargminrX   r$   )	xqxbxq_normsxb_normsnqnbZd2r[   rZ   r   r   r   sparse_assign_to_dense   s    

"rp    @  c           
   	      s   j d }j d tj|dd  tj tj|td du rTd d f	dd}|dks|dks|krtt	|t
d| nt|}	|		|t
d|  fS )	z
    decomposes the sparse_assign_to_dense function into blocks to avoid a
    possible memory blow up. Can be run in multithreaded mode, because scipy's
    sparse-dense matrix multiplication is single-threaded.
    r   rG   rH   Nrf   r
   c           	   	      s   | |   }| |   } | |   }d u rPt |dd}n| |   }tdD ]v}t|||  |||  d\}}|dkr||d d < ||d d < ql||k }|| | ||< || ||< qld S )Nrf   r
   r   )rl   rm   )r    rg   rh   r%   r&   rp   )	rB   Zxq_blockZIblockZDblockZxq_norms_blockjZDiZIimask	rZ   r[   bbsro   qbsrk   rm   rj   rl   r   r   handle_query_block   s&    
z9sparse_assign_to_dense_blocks.<locals>.handle_query_block)r   r    emptyfillinfonesr'   r%   listmapr&   r   )
rj   rk   rl   rm   rv   ru   ntrn   rw   poolr   rt   r   sparse_assign_to_dense_blocks   s    

r   c                   @   s2   e Zd ZdZdd Zdd Zdd Zdd	d
ZdS )DatasetAssignSparserE   c                 C   s4   |j tjjksJ || _t|dd| _	d S )Nrf   r
   )
	__class__scipysparseZ
csr_matrixrJ   r    rg   rh   r%   squared_normsrK   r   r   r   rM      s    zDatasetAssignSparse.__init__c                 C   s   t | j|  S r   )r    rg   rJ   todenserS   r   r   r   rU     s    zDatasetAssignSparse.get_subsetc                 C   s   t | j|| jdS )N)rl   )r   rJ   r   rV   r   r   r   rW     s    
z"DatasetAssignSparse.perform_searchNc           	      C   s   |  |\}}| }| }| jjd }|d u r@tj|dd}t|}tjj	||t
|d f||fd}t|| j  }|||fS )Nr   rG   rH   r
   )r   )rW   rX   rJ   r   r    r{   lenr   r   Z
csc_matrixr$   rg   r   )	rL   r   rY   rZ   r[   nr\   mr]   r   r   r   r^   	  s    zDatasetAssignSparse.assign_to)N)r_   r`   ra   rb   rM   rU   rW   r^   r   r   r   r   r      s
   r   c                 C   s&   t j|dd}tt|| t|S )NZint64rH   )r    rI   r   imbalance_factorr   Zswig_ptr)kr   r   r   r   r     s    r   c                 C   s>   | j tjkrdS dd l}t| |jr(dS tdt|  d S )NFr   TzUnknown tensor type )r   r    Zndarraytorchr9   ZTensorNotImplementedErrortype)rJ   r   r   r   r   check_if_torch   s    r   c                 C   st  |du rt j}|j\}}d}t|}t | dkd }t|dkrFdS |rbddl}||d }	nt |d }	|	ddd  d7  < |	ddd  d8  < t|dkrp| dd }
d|
|
dk < |
|
	  }
|
dk	 }t
||j}|j|||
d}t|d| |D ]V\}}|| }||	 ||< ||	 ||< | | d | |< | |  | | 8  < |d7 }q
||d }q|S )z/ reassign centroids when some of them collapse Nr   rf   g      P?r
   float)r;   p)r    randomr   r   wherer   r   Z	ones_likeastyper%   r"   r;   choicezip)hassignr   rsr   r-   nsplitis_torchZempty_centsr   facZprobasZnnzZnreplaceZcjscicjcr   r   r   reassign_centroids)  s<    
r     Fc              	   C   s  |  |  }}|rtnt}	|	d||| ||f  tj|}
td t }|
j|| dd}|	|}t
|}g }|	d d}g }t|D ]L}t }|	ddd	d
 ||\}}}|	ddd	d
 |t | 7 }| }|r| }|| tj|| d}|ddd}d||dk< |rBddl}|||j}|| }t|||
}|t | |t| ||d}|	d||d |d ||d |f  || |dur|	d| |rddl}||| qt|| q|r||fS |S dS )a0  Pure python kmeans implementation. Follows the Faiss C++ version
    quite closely, but takes a DatasetAssign instead of a training data
    matrix. Also redo is not implemented.

    For the torch implementation, the centroids are tensors (possibly on GPU),
    but the indices remain numpy on CPU.
    zAClustering %d points in %dD to %d clusters, %d iterations seed %dz
preproc...F)r;   replacez  doner   Z	assigningr   Tr   zcompute centroidsr   r   r
   rG   N)objr   time_searchr   r   zM  Iteration %d (%.2f s, search %.2f s): objective=%g imbalance=%.3f nsplit=%dr   r   r   zstoring centroids in)rP   rR   r   r   r    r   ZRandomStater   r   rU   r   r&   r^   r%   itemr)   r!   Zreshaper   r   Z
from_numpytoZdevicer   r   save)r   datar   seed
checkpointr   Zreturn_statsr   r-   r.   r   r0   permr   r   r   Zt_search_totr   rB   Zt0sr   rZ   Zsumserrr   r   r   r   sr   r   r   kmeansZ  sn    	




r   )Tr	   )NN)NNrq   rq   N)N)r	   r   NTF)rb   numpyr    r   r   Zmultiprocessing.poolr   Zscipy.sparser   ImportErrorr   r   r8   r>   rD   rc   rp   r   r   r   r   r   r   r   r   r   r   <module>   s.   
G#"
 
0"	
1  