a
    `gg                     @  s   d dl mZ d dlZd dlZd dlmZmZmZmZm	Z	m
Z
mZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZmZmZ d dlmZmZmZmZmZ d dlm Z  e!e"Z#dd	d
ddddddZ$G dd deeZ%dS )    )annotationsN)AnyDictIterableListLiteralMappingOptionalSequenceSetTupleUnioncast)
Embeddings)from_envget_pydantic_field_namessecret_from_env)	BaseModel
ConfigDictField	SecretStrmodel_validator)SelfintzList[Union[List[int], str]]List[List[float]]z	List[int]boolzList[Optional[List[float]]])	num_textstokensbatched_embeddingsindices
skip_emptyreturnc           	        s&  dd t | D }dd t | D t t|D ]J |rJt|  dkrJq0||   |   |   t|   q0g }t | D ] |  }t|dkr|d  qqt|dkr||d  qqt   fddt| D }tdd |D d	 |fd
d|D  q|S )Nc                 S  s   g | ]}g qS  r"   .0_r"   r"   n/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_openai/embeddings/base.py
<listcomp>'       z7_process_batched_chunked_embeddings.<locals>.<listcomp>c                 S  s   g | ]}g qS r"   r"   r#   r"   r"   r&   r'   ,   r(      r   c                   s,   g | ]$}t d d t|  D  qS )c                 s  s   | ]\}}|| V  qd S Nr"   )r$   valweightr"   r"   r&   	<genexpr>K   s   zA_process_batched_chunked_embeddings.<locals>.<listcomp>.<genexpr>)sumzip)r$   	embedding)inum_tokens_in_batchtotal_weightr"   r&   r'   J   s   c                 s  s   | ]}|d  V  qdS )   Nr"   r$   r+   r"   r"   r&   r-   U   r(   z6_process_batched_chunked_embeddings.<locals>.<genexpr>g      ?c                   s   g | ]}|  qS r"   r"   r5   )	magnituder"   r&   r'   V   r(   )rangelenappendr.   r/   )	r   r   r   r   r    results
embeddings_resultZaverager"   )r1   r6   r2   r3   r&   #_process_batched_chunked_embeddings   s.    	
r=   c                   @  s  e Zd ZU dZedddZded< edddZded< dZd	ed
< dZ	ded< eZ
ded< eedddddZded< ededdddZded< eeddddZded< eeddddZded< dZded< ed ed!dddZd"ed#< ed$ed%d&gdddZded'< dZd(ed)< dZd*ed+< d,Zded-< d.Zded/< edd0d1Zd2ed3< dZded4< dZd5ed6< dZded7< d8Zd5ed9< eedZd:ed;< d8Zd5ed<< dZ d=ed>< dZ!d?ed@< dAZ"dedB< dCZ#dedD< dZ$dEedF< dZ%dEedG< dZ&d5edH< e'dIddJdKZ(e)dLdMe*d:ddNdOdPZ+e)dQdMdRdSdTdUZ,e-d:dSdVdWZ.dXddYdZd[d\Z/dd]dXd	dd^d_d`daZ0dd]dXd	dd^d_dbdcZ1dodXddd^dZdedfZ2dpdXddd^dZdgdhZ3d	didjdkdlZ4d	didjdmdnZ5dS )qOpenAIEmbeddingsu	  OpenAI embedding model integration.

    Setup:
        Install ``langchain_openai`` and set environment variable ``OPENAI_API_KEY``.

        .. code-block:: bash

            pip install -U langchain_openai
            export OPENAI_API_KEY="your-api-key"

    Key init args — embedding params:
        model: str
            Name of OpenAI model to use.
        dimensions: Optional[int] = None
            The number of dimensions the resulting output embeddings should have.
            Only supported in `text-embedding-3` and later models.

    Key init args — client params:
        api_key: Optional[SecretStr] = None
            OpenAI API key.
        organization: Optional[str] = None
            OpenAI organization ID. If not passed in will be read
            from env var OPENAI_ORG_ID.
        max_retries: int = 2
            Maximum number of retries to make when generating.
        request_timeout: Optional[Union[float, Tuple[float, float], Any]] = None
            Timeout for requests to OpenAI completion API

    See full list of supported init args and their descriptions in the params section.

    Instantiate:
        .. code-block:: python

            from langchain_openai import OpenAIEmbeddings

            embed = OpenAIEmbeddings(
                model="text-embedding-3-large"
                # With the `text-embedding-3` class
                # of models, you can specify the size
                # of the embeddings you want returned.
                # dimensions=1024
            )

    Embed single text:
        .. code-block:: python

            input_text = "The meaning of life is 42"
            vector = embeddings.embed_query("hello")
            print(vector[:3])

        .. code-block:: python

            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Embed multiple texts:
        .. code-block:: python

            vectors = embeddings.embed_documents(["hello", "goodbye"])
            # Showing only the first 3 coordinates
            print(len(vectors))
            print(vectors[0][:3])

        .. code-block:: python

            2
            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Async:
        .. code-block:: python

            await embed.aembed_query(input_text)
            print(vector[:3])

            # multiple:
            # await embed.aembed_documents(input_texts)

        .. code-block:: python

            [-0.009100092574954033, 0.005071679595857859, -0.0029193938244134188]
    NT)defaultexcluder   clientasync_clientztext-embedding-ada-002strmodelzOptional[int]
dimensionszOptional[str]
deploymentZOPENAI_API_VERSION)r?   api_version)default_factoryaliasopenai_api_versionbase_urlZOPENAI_API_BASE)rI   rH   openai_api_baseZOPENAI_API_TYPE)rH   openai_api_typeZOPENAI_PROXYopenai_proxyi  r   embedding_ctx_lengthapi_keyZOPENAI_API_KEYzOptional[SecretStr]openai_api_keyorganizationZOPENAI_ORG_IDZOPENAI_ORGANIZATIONopenai_organizationz%Union[Literal['all'], Set[str], None]allowed_specialz4Union[Literal['all'], Set[str], Sequence[str], None]disallowed_speciali  
chunk_sizer4   max_retriestimeout)r?   rI   z0Optional[Union[float, Tuple[float, float], Any]]request_timeoutheadersr   tiktoken_enabledtiktoken_model_nameFshow_progress_barzDict[str, Any]model_kwargsr    zUnion[Mapping[str, str], None]default_headersz!Union[Mapping[str, object], None]default_query   retry_min_seconds   retry_max_secondszUnion[Any, None]http_clienthttp_async_clientcheck_embedding_ctx_lengthZforbidr"   )extraZpopulate_by_nameZprotected_namespacesbefore)mode)valuesr!   c              
   C  s   t | }|di }t|D ]P}||v r8td| d||vrtd| d| d| d ||||< q|| }|rtd| d	||d< |S )
z>Build extra kwargs from additional params that were passed in.r^   zFound z supplied twice.z	WARNING! z/ is not default parameter.
                    zJ was transferred to model_kwargs.
                    Please confirm that z is what you intended.zParameters za should be specified explicitly. Instead they were passed in as part of `model_kwargs` parameter.)	r   getlist
ValueErrorwarningswarnpopintersectionkeys)clsrk   Zall_required_field_namesrh   
field_nameZinvalid_model_kwargsr"   r"   r&   build_extra  s*    
zOpenAIEmbeddings.build_extraafterr   r!   c           	   
   C  s  | j dv rtd| jr"| j nd| j| j| j| j| j| j	d}| j
r| jsT| jr| j
}| j}| j}td|d|d|| js| j
r| jszddl}W n. ty } ztd	|W Y d}~n
d}~0 0 |j| j
d
| _d| ji}tjf i ||j| _| js| j
rh| jshzddl}W n0 tyV } ztd	|W Y d}~n
d}~0 0 |j| j
d
| _d| ji}tjf i ||j| _| S )z?Validate that api key and python package exists in environment.)ZazureZazure_adZazureadzEIf you are using Azure, please use the `AzureOpenAIEmbeddings` class.N)rP   rR   rK   rX   rW   r_   r`   zwCannot specify 'openai_proxy' if one of 'http_client'/'http_async_client' is already specified. Received:
openai_proxy=z
http_client=z
http_async_client=r   zRCould not import httpx python package. Please install it with `pip install httpx`.)proxyre   )rM   rn   rQ   Zget_secret_valuerS   rL   rY   rW   r_   r`   rN   re   rf   rA   httpxImportErrorZClientopenaiZOpenAIr;   rB   ZAsyncClientZAsyncOpenAI)	selfZclient_paramsrN   re   rf   rz   eZsync_specificZasync_specificr"   r"   r&   validate_environment*  sp    



z%OpenAIEmbeddings.validate_environmentc                 C  s(   d| j i| j}| jd ur$| j|d< |S )NrD   rE   )rD   r^   rE   )r}   paramsr"   r"   r&   _invocation_paramsd  s    

z#OpenAIEmbeddings._invocation_paramsz	List[str]z<Tuple[Iterable[int], List[Union[List[int], str]], List[int]])textsrV   r!   c                 C  s   g }g }| j p| j}| jszddlm} W n tyD   tdY n0 |j|d}t|D ]^\}}	|j	|	dd}
t
dt|
| jD ]4}|
||| j  }||}|| || qqZnzt|}W n ty   td}Y n0 dd	 | j| jd
 D }t|D ]\}}	| jdr.|	dd}	|rH|j	|	fi |}n
||	}t
dt|| jD ](}||||| j   || qdq| jrz$ddlm} |t
dt||}W n$ ty   t
dt||}Y n0 nt
dt||}|||fS )a  
        Take the input `texts` and `chunk_size` and return 3 iterables as a tuple:

        We have `batches`, where batches are sets of individual texts
        we want responses from the openai api. The length of a single batch is
        `chunk_size` texts.

        Each individual text is also split into multiple texts based on the
        `embedding_ctx_length` parameter (based on number of tokens).

        This function returns a 3-tuple of the following:

        _iter: An iterable of the starting index in `tokens` for each *batch*
        tokens: A list of tokenized texts, where each text has already been split
            into sub-texts based on the `embedding_ctx_length` parameter. In the
            case of tiktoken, this is a list of token arrays. In the case of
            HuggingFace transformers, this is a list of strings.
        indices: An iterable of the same length as `tokens` that maps each token-array
            to the index of the original text in `texts`.
        r   )AutoTokenizerzCould not import transformers python package. This is needed for OpenAIEmbeddings to work without `tiktoken`. Please install it with `pip install transformers`. )Zpretrained_model_name_or_pathF)Zadd_special_tokensZcl100k_basec                 S  s   i | ]\}}|d ur||qS r*   r"   )r$   kvr"   r"   r&   
<dictcomp>  s   z.OpenAIEmbeddings._tokenize.<locals>.<dictcomp>)rT   rU   Z001
 )tqdm)r\   rD   r[   Ztransformersr   r{   rn   Zfrom_pretrained	enumerateencoder7   r8   rO   decoder9   tiktokenZencoding_for_modelKeyErrorZget_encodingrT   rU   itemsendswithreplaceZencode_ordinaryr]   Z	tqdm.autor   )r}   r   rV   r   r   Z
model_namer   Z	tokenizerr1   textZ	tokenizedjZtoken_chunkZ
chunk_textencodingZencoder_kwargstokenr   _iterr"   r"   r&   	_tokenizek  s`    



zOpenAIEmbeddings._tokenize)rV   r   )r   enginerV   r!   c                  s   |pj }||\}}}g }|D ]R}	jjf d||	|	|  ij}
t|
ts^|
 }
|dd |
d D  q$t	t
||||j}d dd fdd	fd
d|D S )al  
        Generate length-safe embeddings for a list of texts.

        This method handles tokenization and embedding generation, respecting the
        set embedding context length and chunk size. It supports both tiktoken
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        inputc                 s  s   | ]}|d  V  qdS r0   Nr"   r$   rr"   r"   r&   r-     r(   z<OpenAIEmbeddings._get_len_safe_embeddings.<locals>.<genexpr>dataNList[float]rx   c                    sF    d u rBj jf ddij} t| ts2|  } | d d d   S Nr    r   r   r0   )rA   creater   
isinstancedict
model_dumpZaverage_embedded_cached_empty_embeddingr}   r"   r&   empty_embedding  s    

zBOpenAIEmbeddings._get_len_safe_embeddings.<locals>.empty_embeddingc                   s   g | ]}|d ur|n  qS r*   r"   r$   r~   r   r"   r&   r'     r(   z=OpenAIEmbeddings._get_len_safe_embeddings.<locals>.<listcomp>)rV   r   rA   r   r   r   r   r   extendr=   r8   r    r}   r   r   rV   _chunk_sizer   r   r   r   r1   responser;   r"   r   r   r}   r&   _get_len_safe_embeddings  s$    


z)OpenAIEmbeddings._get_len_safe_embeddingsc                  s   |pj }||\}}}g }|p(j }tdt||D ]X}	jjf d||	|	|  ijI dH }
t|
tsz|
	 }
|
dd |
d D  q:tt||||j}d dd fd	d
fdd|D I dH S )a  
        Asynchronously generate length-safe embeddings for a list of texts.

        This method handles tokenization and asynchronous embedding generation,
        respecting the set embedding context length and chunk size. It supports both
        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        r   r   Nc                 s  s   | ]}|d  V  qdS r   r"   r   r"   r"   r&   r-     r(   z=OpenAIEmbeddings._aget_len_safe_embeddings.<locals>.<genexpr>r   r   rx   c                    sL    d u rHj jf ddijI d H } t| ts8|  } | d d d   S r   )rB   r   r   r   r   r   r   r   r"   r&   r   #  s    

zCOpenAIEmbeddings._aget_len_safe_embeddings.<locals>.empty_embeddingc                   s$   g | ]}|d ur|n
  I d H qS r*   r"   r   r   r"   r&   r'   .  r(   z>OpenAIEmbeddings._aget_len_safe_embeddings.<locals>.<listcomp>)rV   r   r7   r8   rB   r   r   r   r   r   r   r=   r    r   r"   r   r&   _aget_len_safe_embeddings  s&    



z*OpenAIEmbeddings._aget_len_safe_embeddingsz
int | Nonec                 C  s   |p| j }| js~g }tdt|| j D ]R}| jjf d||||  i| j}t|ts`| }|	dd |d D  q&|S t
t| j}| j||dS )aM  Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
        r   r   c                 s  s   | ]}|d  V  qdS r   r"   r   r"   r"   r&   r-   F  r(   z3OpenAIEmbeddings.embed_documents.<locals>.<genexpr>r   r   )rV   rg   r7   r8   rA   r   r   r   r   r   r   rC   rF   r   r}   r   rV   Zchunk_size_r;   r1   r   r   r"   r"   r&   embed_documents0  s    


z OpenAIEmbeddings.embed_documentsc                   s   |p| j }| jsg }tdt||D ]X}| jjf d||||  i| jI dH }t|tsd| }|	dd |d D  q$|S t
t| j}| j||dI dH S )aS  Call out to OpenAI's embedding endpoint async for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
        r   r   Nc                 s  s   | ]}|d  V  qdS r   r"   r   r"   r"   r&   r-   d  r(   z4OpenAIEmbeddings.aembed_documents.<locals>.<genexpr>r   r   )rV   rg   r7   r8   rB   r   r   r   r   r   r   rC   rF   r   r   r"   r"   r&   aembed_documentsN  s    


z!OpenAIEmbeddings.aembed_documentsr   )r   r!   c                 C  s   |  |gd S )zCall out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        r   )r   )r}   r   r"   r"   r&   embed_queryl  s    	zOpenAIEmbeddings.embed_queryc                   s   |  |gI dH }|d S )zCall out to OpenAI's embedding endpoint async for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        Nr   )r   )r}   r   r;   r"   r"   r&   aembed_queryw  s    	zOpenAIEmbeddings.aembed_query)N)N)6__name__
__module____qualname____doc__r   rA   __annotations__rB   rD   rE   rF   r   rJ   rL   rM   rN   rO   r   rQ   rS   rT   rU   rV   rW   rY   rZ   r[   r\   r]   r   r^   r    r_   r`   rb   rd   re   rf   rg   r   Zmodel_configr   classmethodrv   r   propertyr   r   r   r   r   r   r   r   r"   r"   r"   r&   r>   [   s   
Q



9c23  r>   )&
__future__r   loggingro   typingr   r   r   r   r   r   r	   r
   r   r   r   r   r|   r   Zlangchain_core.embeddingsr   Zlangchain_core.utilsr   r   r   Zpydanticr   r   r   r   r   Ztyping_extensionsr   	getLoggerr   loggerr=   r>   r"   r"   r"   r&   <module>   s   8
=