# /var/www/html/emsaiapi.evdpl.com/venv/lib/python3.9/site-packages/langchain_openai/embeddings/base.py
from __future__ import annotations

import logging
import warnings
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Literal,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
    cast,
)

import openai
import tiktoken
from langchain_core.embeddings import Embeddings
from langchain_core.utils import (
    from_env,
    get_pydantic_field_names,
    secret_from_env,
)
from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator
from typing_extensions import Self

logger = logging.getLogger(__name__)


def _process_batched_chunked_embeddings(
    num_texts: int,
    tokens: List[Union[List[int], str]],
    batched_embeddings: List[List[float]],
    indices: List[int],
    skip_empty: bool,
) -> List[Optional[List[float]]]:
    # For each original text, collect the embeddings of its chunks and the
    # length of each chunk (used as the averaging weight).
    results: List[List[List[float]]] = [[] for _ in range(num_texts)]
    num_tokens_in_batch: List[List[int]] = [[] for _ in range(num_texts)]

    for i in range(len(indices)):
        if skip_empty and len(batched_embeddings[i]) == 1:
            continue
        results[indices[i]].append(batched_embeddings[i])
        num_tokens_in_batch[indices[i]].append(len(tokens[i]))

    embeddings: List[Optional[List[float]]] = []
    for i in range(num_texts):
        _result = results[i]
        if len(_result) == 0:
            # No usable chunk embeddings: the caller substitutes the embedding
            # of the empty string.
            embeddings.append(None)
        elif len(_result) == 1:
            embeddings.append(_result[0])
        else:
            # Token-weighted average of the chunk embeddings, then L2-normalize.
            total_weight = sum(num_tokens_in_batch[i])
            average = [
                sum(
                    val * weight
                    for val, weight in zip(embedding, num_tokens_in_batch[i])
                )
                / total_weight
                for embedding in zip(*_result)
            ]
            magnitude = sum(val**2 for val in average) ** 0.5
            embeddings.append([val / magnitude for val in average])
    return embeddings


class OpenAIEmbeddings(BaseModel, Embeddings):
    """OpenAI embedding model integration.

    Setup:
        Install ``langchain_openai`` and set environment variable ``OPENAI_API_KEY``.

        .. code-block:: bash

            pip install -U langchain_openai
            export OPENAI_API_KEY="your-api-key"

    Key init args — embedding params:
        model: str
            Name of OpenAI model to use.
        dimensions: Optional[int] = None
            The number of dimensions the resulting output embeddings should have.
            Only supported in `text-embedding-3` and later models.

    Key init args — client params:
        api_key: Optional[SecretStr] = None
            OpenAI API key.
        organization: Optional[str] = None
            OpenAI organization ID. If not passed in, will be read
            from env var OPENAI_ORG_ID.
        max_retries: int = 2
            Maximum number of retries to make when generating.
        request_timeout: Optional[Union[float, Tuple[float, float], Any]] = None
            Timeout for requests to the OpenAI API. Can be a float, an
            ``httpx.Timeout``, or None.

    See full list of supported init args and their descriptions in the params section.
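
    Instantiate with client params (a sketch; the key and organization values
    below are placeholders, not real credentials):
        .. code-block:: python

            from langchain_openai import OpenAIEmbeddings

            embed = OpenAIEmbeddings(
                model="text-embedding-3-large",
                api_key="sk-...",          # or rely on OPENAI_API_KEY
                organization="org-...",    # or rely on OPENAI_ORG_ID
                max_retries=2,
                request_timeout=30.0,
            )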

    Instantiate:
        .. code-block:: python

            from langchain_openai import OpenAIEmbeddings

            embed = OpenAIEmbeddings(
                model="text-embedding-3-large"
                # With the `text-embedding-3` class
                # of models, you can specify the size
                # of the embeddings you want returned.
                # dimensions=1024
            )

    Embed single text:
        .. code-block:: python

            input_text = "The meaning of life is 42"
            vector = embeddings.embed_query("hello")
            print(vector[:3])

        .. code-block:: python

            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Embed multiple texts:
        .. code-block:: python

            vectors = embeddings.embed_documents(["hello", "goodbye"])
            # Showing only the first 3 coordinates
            print(len(vectors))
            print(vectors[0][:3])

        .. code-block:: python

            2
            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Async:
        .. code-block:: python

            vector = await embed.aembed_query(input_text)
            print(vector[:3])

            # multiple:
            # await embed.aembed_documents(input_texts)

        .. code-block:: python

            [-0.009100092574954033, 0.005071679595857859, -0.0029193938244134188]
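
    Reduced dimensions (a sketch; only the ``text-embedding-3`` model family
    supports the ``dimensions`` parameter):
        .. code-block:: python

            embed_small = OpenAIEmbeddings(
                model="text-embedding-3-large",
                dimensions=256,
            )
            print(len(embed_small.embed_query("hello")))
            # 256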
    NT)defaultexcluder   clientasync_clientztext-embedding-ada-002strmodelzOptional[int]
dimensionszOptional[str]
deploymentOPENAI_API_VERSION)r?   api_version)default_factoryaliasopenai_api_versionbase_urlOPENAI_API_BASE)rJ   rI   openai_api_baseOPENAI_API_TYPE)rI   openai_api_typeOPENAI_PROXYopenai_proxyi  r   embedding_ctx_lengthapi_keyOPENAI_API_KEYzOptional[SecretStr]openai_api_keyorganizationOPENAI_ORG_IDOPENAI_ORGANIZATIONopenai_organizationz%Union[Literal['all'], Set[str], None]allowed_specialz4Union[Literal['all'], Set[str], Sequence[str], None]disallowed_speciali  
chunk_sizer4   max_retriestimeout)r?   rJ   z0Optional[Union[float, Tuple[float, float], Any]]request_timeoutheadersr   tiktoken_enabledtiktoken_model_nameFshow_progress_barzDict[str, Any]model_kwargsr    zUnion[Mapping[str, str], None]default_headersz!Union[Mapping[str, object], None]default_query   retry_min_seconds   retry_max_secondszUnion[Any, None]http_clienthttp_async_clientcheck_embedding_ctx_lengthforbidr"   )extrapopulate_by_nameprotected_namespacesbefore)mode)valuesr!   c              
    @model_validator(mode="before")
    @classmethod
    def build_extra(cls, values: Dict[str, Any]) -> Any:
        """Build extra kwargs from additional params that were passed in."""
        all_required_field_names = get_pydantic_field_names(cls)
        extra = values.get("model_kwargs", {})
        for field_name in list(values):
            if field_name in extra:
                raise ValueError(f"Found {field_name} supplied twice.")
            if field_name not in all_required_field_names:
                warnings.warn(
                    f"""WARNING! {field_name} is not default parameter.
                    {field_name} was transferred to model_kwargs.
                    Please confirm that {field_name} is what you intended."""
                )
                extra[field_name] = values.pop(field_name)

        invalid_model_kwargs = all_required_field_names.intersection(extra.keys())
        if invalid_model_kwargs:
            raise ValueError(
                f"Parameters {invalid_model_kwargs} should be specified explicitly. "
                "Instead they were passed in as part of `model_kwargs` parameter."
            )

        values["model_kwargs"] = extra
        return values
    @model_validator(mode="after")
    def validate_environment(self) -> Self:
        """Validate that api key and python package exists in environment."""
        if self.openai_api_type in ("azure", "azure_ad", "azuread"):
            raise ValueError(
                "If you are using Azure, "
                "please use the `AzureOpenAIEmbeddings` class."
            )
        client_params: dict = {
            "api_key": (
                self.openai_api_key.get_secret_value()
                if self.openai_api_key
                else None
            ),
            "organization": self.openai_organization,
            "base_url": self.openai_api_base,
            "timeout": self.request_timeout,
            "max_retries": self.max_retries,
            "default_headers": self.default_headers,
            "default_query": self.default_query,
        }
        if self.openai_proxy and (self.http_client or self.http_async_client):
            openai_proxy = self.openai_proxy
            http_client = self.http_client
            http_async_client = self.http_async_client
            raise ValueError(
                "Cannot specify 'openai_proxy' if one of "
                "'http_client'/'http_async_client' is already specified. Received:\n"
                f"{openai_proxy=}\n{http_client=}\n{http_async_client=}"
            )
        if not self.client:
            if self.openai_proxy and not self.http_client:
                try:
                    import httpx
                except ImportError as e:
                    raise ImportError(
                        "Could not import httpx python package. "
                        "Please install it with `pip install httpx`."
                    ) from e
                self.http_client = httpx.Client(proxy=self.openai_proxy)
            sync_specific = {"http_client": self.http_client}
            self.client = openai.OpenAI(**client_params, **sync_specific).embeddings
        if not self.async_client:
            if self.openai_proxy and not self.http_async_client:
                try:
                    import httpx
                except ImportError as e:
                    raise ImportError(
                        "Could not import httpx python package. "
                        "Please install it with `pip install httpx`."
                    ) from e
                self.http_async_client = httpx.AsyncClient(proxy=self.openai_proxy)
            async_specific = {"http_client": self.http_async_client}
            self.async_client = openai.AsyncOpenAI(
                **client_params, **async_specific
            ).embeddings
        return self
    @property
    def _invocation_params(self) -> Dict[str, Any]:
        params: Dict = {"model": self.model, **self.model_kwargs}
        if self.dimensions is not None:
            params["dimensions"] = self.dimensions
        return params

    def _tokenize(
        self, texts: List[str], chunk_size: int
    ) -> Tuple[Iterable[int], List[Union[List[int], str]], List[int]]:
        """
        Take the input `texts` and `chunk_size` and return 3 iterables as a tuple:

        We have `batches`, where batches are sets of individual texts
        for which we want responses from the OpenAI API. The length of a single
        batch is `chunk_size` texts.

        Each individual text is also split into multiple texts based on the
        `embedding_ctx_length` parameter (based on number of tokens).

        This function returns a 3-tuple of the following:

        _iter: An iterable of the starting index in `tokens` for each *batch*
        tokens: A list of tokenized texts, where each text has already been split
            into sub-texts based on the `embedding_ctx_length` parameter. In the
            case of tiktoken, this is a list of token arrays. In the case of
            HuggingFace transformers, this is a list of strings.
        indices: An iterable of the same length as `tokens` that maps each token-array
            to the index of the original text in `texts`.
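
        A sketch of the returned values for a hypothetical call (two input
        texts, the second long enough to be split into two chunks; the token
        ids shown are made up):

        .. code-block:: python

            _iter, tokens, indices = self._tokenize(["short", "very long ..."], 1000)
            # tokens  -> [[1723, ...], [402, ...], [98, ...]]   chunked token arrays
            # indices -> [0, 1, 1]            chunk -> index of the originating text
            # _iter   -> range(0, 3, 1000)    start offset of each batch of chunks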
        """
        tokens = []
        indices = []
        model_name = self.tiktoken_model_name or self.model

        # If tiktoken is disabled, fall back to a HuggingFace tokenizer.
        if not self.tiktoken_enabled:
            try:
                from transformers import AutoTokenizer
            except ImportError:
                raise ValueError(
                    "Could not import transformers python package. "
                    "This is needed for OpenAIEmbeddings to work without "
                    "`tiktoken`. Please install it with `pip install transformers`. "
                )

            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=model_name
            )
            for i, text in enumerate(texts):
                tokenized = tokenizer.encode(text, add_special_tokens=False)
                # Split the token ids into chunks of at most embedding_ctx_length,
                # then decode each chunk back to a string.
                for j in range(0, len(tokenized), self.embedding_ctx_length):
                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
                    chunk_text = tokenizer.decode(token_chunk)
                    tokens.append(chunk_text)
                    indices.append(i)
        else:
            try:
                encoding = tiktoken.encoding_for_model(model_name)
            except KeyError:
                encoding = tiktoken.get_encoding("cl100k_base")
            encoder_kwargs: Dict[str, Any] = {
                k: v
                for k, v in {
                    "allowed_special": self.allowed_special,
                    "disallowed_special": self.disallowed_special,
                }.items()
                if v is not None
            }
            for i, text in enumerate(texts):
                if self.model.endswith("001"):
                    # Replace newlines, which can negatively affect performance
                    # for the older -001 models.
                    text = text.replace("\n", " ")
                if encoder_kwargs:
                    token = encoding.encode(text, **encoder_kwargs)
                else:
                    token = encoding.encode_ordinary(text)

                # Split the token ids into chunks of at most embedding_ctx_length.
                for j in range(0, len(token), self.embedding_ctx_length):
                    tokens.append(token[j : j + self.embedding_ctx_length])
                    indices.append(i)

        if self.show_progress_bar:
            try:
                from tqdm.auto import tqdm

                _iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
            except ImportError:
                _iter = range(0, len(tokens), chunk_size)
        else:
            _iter = range(0, len(tokens), chunk_size)

        return _iter, tokens, indices

    def _get_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """
        Generate length-safe embeddings for a list of texts.

        This method handles tokenization and embedding generation, respecting the
        set embedding context length and chunk size. It supports both tiktoken
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
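
        Chunk embeddings for a single text are recombined by
        `_process_batched_chunked_embeddings`: a token-count-weighted average
        followed by L2 normalization. A roughly equivalent NumPy sketch
        (`chunk_embeddings` and `chunk_token_counts` are illustrative names):

        .. code-block:: python

            import numpy as np

            chunk_vecs = np.array(chunk_embeddings)   # shape (n_chunks, dim)
            weights = np.array(chunk_token_counts)    # tokens per chunk
            avg = np.average(chunk_vecs, axis=0, weights=weights)
            final = (avg / np.linalg.norm(avg)).tolist()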
        """
        _chunk_size = chunk_size or self.chunk_size
        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
        batched_embeddings: List[List[float]] = []
        for i in _iter:
            response = self.client.create(
                input=tokens[i : i + _chunk_size], **self._invocation_params
            )
            if not isinstance(response, dict):
                response = response.model_dump()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        embeddings = _process_batched_chunked_embeddings(
            len(texts), tokens, batched_embeddings, indices, self.skip_empty
        )
        _cached_empty_embedding: Optional[List[float]] = None

        def empty_embedding() -> List[float]:
            # Lazily fetch (and cache) the embedding of the empty string for
            # texts that produced no usable chunks.
            nonlocal _cached_empty_embedding
            if _cached_empty_embedding is None:
                average_embedded = self.client.create(
                    input="", **self._invocation_params
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.model_dump()
                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
            return _cached_empty_embedding

        return [e if e is not None else empty_embedding() for e in embeddings]

    async def _aget_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """
        Asynchronously generate length-safe embeddings for a list of texts.

        This method handles tokenization and asynchronous embedding generation,
        respecting the set embedding context length and chunk size. It supports both
        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
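
        As in the sync variant, any text whose chunks were all skipped falls
        back to the embedding of the empty string, fetched lazily once. A
        sketch of that fallback call:

        .. code-block:: python

            average_embedded = await self.async_client.create(
                input="", **self._invocation_params
            )
            empty_vector = average_embedded.model_dump()["data"][0]["embedding"]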
        """
        _chunk_size = chunk_size or self.chunk_size
        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
        batched_embeddings: List[List[float]] = []
        for i in range(0, len(tokens), _chunk_size):
            response = await self.async_client.create(
                input=tokens[i : i + _chunk_size], **self._invocation_params
            )
            if not isinstance(response, dict):
                response = response.model_dump()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        embeddings = _process_batched_chunked_embeddings(
            len(texts), tokens, batched_embeddings, indices, self.skip_empty
        )
        _cached_empty_embedding: Optional[List[float]] = None

        async def empty_embedding() -> List[float]:
            # Lazily fetch (and cache) the embedding of the empty string for
            # texts that produced no usable chunks.
            nonlocal _cached_empty_embedding
            if _cached_empty_embedding is None:
                average_embedded = await self.async_client.create(
                    input="", **self._invocation_params
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.model_dump()
                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
            return _cached_empty_embedding

        return [e if e is not None else await empty_embedding() for e in embeddings]

    def embed_documents(
        self, texts: List[str], chunk_size: int | None = None
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
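
        Example (a sketch; `embed` is an instance configured as in the class
        docstring):

        .. code-block:: python

            vectors = embed.embed_documents(
                ["first document", "second document"], chunk_size=500
            )
            assert len(vectors) == 2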
        """
        chunk_size_ = chunk_size or self.chunk_size
        if not self.check_embedding_ctx_length:
            embeddings: List[List[float]] = []
            for i in range(0, len(texts), chunk_size_):
                response = self.client.create(
                    input=texts[i : i + chunk_size_], **self._invocation_params
                )
                if not isinstance(response, dict):
                    response = response.model_dump()
                embeddings.extend(r["embedding"] for r in response["data"])
            return embeddings

        # The list may contain texts longer than the maximum context, so use
        # the length-safe embedding function.
        engine = cast(str, self.deployment)
        return self._get_len_safe_embeddings(texts, engine=engine)

    async def aembed_documents(
        self, texts: List[str], chunk_size: int | None = None
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint async for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
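
        Example (a sketch; call from an async context, with `embed` configured
        as in the class docstring):

        .. code-block:: python

            vectors = await embed.aembed_documents(
                ["first document", "second document"]
            )
            print(len(vectors), len(vectors[0]))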
        """
        chunk_size_ = chunk_size or self.chunk_size
        if not self.check_embedding_ctx_length:
            embeddings: List[List[float]] = []
            for i in range(0, len(texts), chunk_size_):
                response = await self.async_client.create(
                    input=texts[i : i + chunk_size_], **self._invocation_params
                )
                if not isinstance(response, dict):
                    response = response.model_dump()
                embeddings.extend(r["embedding"] for r in response["data"])
            return embeddings

        # The list may contain texts longer than the maximum context, so use
        # the length-safe embedding function.
        engine = cast(str, self.deployment)
        return await self._aget_len_safe_embeddings(texts, engine=engine)

    def embed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
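
        Query and document vectors can be compared with a plain dot product,
        since OpenAI embeddings are normalized to unit length. A sketch
        (`embed` as in the class docstring; `numpy` assumed available):

        .. code-block:: python

            import numpy as np

            query_vec = embed.embed_query("What is the meaning of life?")
            doc_vecs = embed.embed_documents(["life is 42", "life is a song"])
            scores = np.array(doc_vecs) @ np.array(query_vec)
            best = int(np.argmax(scores))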
        """
        return self.embed_documents([text])[0]

    async def aembed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint async for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
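
        Example (a sketch; call from an async context, with `embed` configured
        as in the class docstring):

        .. code-block:: python

            vector = await embed.aembed_query("hello")
            print(vector[:3])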
        """
        embeddings = await self.aembed_documents([text])
        return embeddings[0]