"""Utilities for running language models or Chains over datasets."""

from __future__ import annotations

import concurrent.futures
import dataclasses
import functools
import inspect
import logging
import uuid
from datetime import datetime, timezone
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    List,
    Optional,
    Tuple,
    Union,
    cast,
)

from langchain_core._api import warn_deprecated
from langchain_core.callbacks.manager import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.messages import BaseMessage, messages_from_dict
from langchain_core.outputs import ChatResult, LLMResult
from langchain_core.runnables import Runnable, RunnableConfig, RunnableLambda
from langchain_core.runnables import config as runnable_config
from langchain_core.runnables import utils as runnable_utils
from langchain_core.tracers.evaluation import (
    EvaluatorCallbackHandler,
    wait_for_all_evaluators,
)
from langchain_core.tracers.langchain import LangChainTracer
from langsmith.client import Client
from langsmith.env import get_git_info, get_langchain_env_var_metadata
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.evaluation import run_evaluator as run_evaluator_dec
from langsmith.run_helpers import as_runnable, is_traceable_function
from langsmith.schemas import Dataset, DataType, Example, Run, TracerSession
from langsmith.utils import LangSmithError
from requests import HTTPError
from typing_extensions import TypedDict

from langchain.chains.base import Chain
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.schema import (
    EvaluatorType,
    PairwiseStringEvaluator,
    StringEvaluator,
)
from langchain.smith import evaluation as smith_eval
from langchain.smith.evaluation import config as smith_eval_config
from langchain.smith.evaluation import name_generation, progress

if TYPE_CHECKING:
    import pandas as pd

logger = logging.getLogger(__name__)

MODEL_OR_CHAIN_FACTORY = Union[
    Callable[[], Union[Chain, Runnable]],
    BaseLanguageModel,
    Callable[[dict], Any],
    Runnable,
    Chain,
]
MCF = Union[Callable[[], Union[Chain, Runnable]], BaseLanguageModel]


class InputFormatError(Exception):
    """Raised when the input format is invalid."""
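# The MODEL_OR_CHAIN_FACTORY alias above admits several calling conventions,
# e.g. a bare language model, an already-constructed Runnable, or (preferably
# for chains) a zero-argument constructor so that each dataset example runs
# against a fresh instance; see _wrap_in_chain_factory below.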
class TestResult(dict):
    """A dictionary of the results of a single test run."""

    def get_aggregate_feedback(self) -> pd.DataFrame:
        """Return quantiles for the feedback scores.

        This method calculates and prints the quantiles for the feedback scores
        across all feedback keys.

        Returns:
            A DataFrame containing the quantiles for each feedback key.
        """
        df = self.to_dataframe()
        # Drop the input/output/reference columns, keeping only the feedback
        # scores and run metadata for aggregation.
        to_drop = [
            col
            for col in df.columns
            if col.startswith("inputs.")
            or col.startswith("outputs.")
            or col in {"input", "output", "reference"}
            or col.startswith("reference")
        ]
        return df.describe(include="all").drop(to_drop, axis=1)

    def to_dataframe(self) -> pd.DataFrame:
        """Convert the results to a dataframe."""
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                "Pandas is required to convert the results to a dataframe."
                " To install pandas, run `pip install pandas`."
            ) from e

        indices = []
        records = []
        for example_id, result in self["results"].items():
            feedback = result["feedback"]
            output_ = result.get("output")
            if isinstance(output_, dict):
                output = {f"outputs.{k}": v for k, v in output_.items()}
            elif output_ is None:
                output = {}
            else:
                output = {"output": output_}
            r = {
                **{f"inputs.{k}": v for k, v in result["input"].items()},
                **output,
            }
            if "reference" in result:
                if isinstance(result["reference"], dict):
                    r.update(
                        {f"reference.{k}": v for k, v in result["reference"].items()}
                    )
                else:
                    r["reference"] = result["reference"]
            r.update(
                {
                    **{f"feedback.{f.key}": f.score for f in feedback},
                    "error": result.get("Error"),
                    "execution_time": result["execution_time"],
                    "run_id": result.get("run_id"),
                }
            )
            records.append(r)
            indices.append(example_id)

        return pd.DataFrame(records, index=indices)


class EvalError(dict):
    """Your architecture raised an error."""

    def __init__(self, Error: BaseException, **kwargs: Any) -> None:
        super().__init__(Error=Error, **kwargs)

    def __getattr__(self, name: str) -> Any:
        try:
            return self[name]
        except KeyError:
            raise AttributeError(f"'EvalError' object has no attribute '{name}'")
def _wrap_in_chain_factory(
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    dataset_name: str = "<my_dataset>",
) -> MCF:
    """Forgive the user if they pass in a chain without memory instead of a chain
    factory. It's a common mistake. Raise a more helpful error message as well."""
    if isinstance(llm_or_chain_factory, Chain):
        chain = llm_or_chain_factory
        chain_class = chain.__class__.__name__
        if llm_or_chain_factory.memory is not None:
            memory_class = chain.memory.__class__.__name__
            raise ValueError(
                "Cannot directly evaluate a chain with stateful memory."
                " To evaluate this chain, pass in a chain constructor"
                " that initializes fresh memory each time it is called."
                " This will safeguard against information"
                " leakage between dataset examples."
                "\nFor example:\n\n"
                "def chain_constructor():\n"
                f"    new_memory = {memory_class}(...)\n"
                f"    return {chain_class}"
                "(memory=new_memory, ...)\n\n"
                f'run_on_dataset("{dataset_name}", chain_constructor, ...)'
            )
        return lambda: chain
    if isinstance(llm_or_chain_factory, BaseLanguageModel):
        return llm_or_chain_factory
    if isinstance(llm_or_chain_factory, Runnable):
        # A Runnable carries no per-example state, so it can be reused directly.
        lcf = llm_or_chain_factory
        return lambda: lcf
    if callable(llm_or_chain_factory):
        if is_traceable_function(llm_or_chain_factory):
            runnable_ = as_runnable(cast(Callable, llm_or_chain_factory))
            return lambda: runnable_
        try:
            _model = llm_or_chain_factory()  # type: ignore[call-arg]
        except TypeError:
            # It's an arbitrary function that takes arguments; wrap it so it
            # is traced like any other runnable.
            user_func = cast(Callable, llm_or_chain_factory)
            sig = inspect.signature(user_func)
            logger.info(f"Wrapping function {sig} as RunnableLambda.")
            wrapped = RunnableLambda(user_func)
            return lambda: wrapped
        constructor = cast(Callable, llm_or_chain_factory)
        if isinstance(_model, BaseLanguageModel):
            # A constructor that returned a raw model; unwrap it for the user.
            return _model
        if is_traceable_function(cast(Callable, _model)):
            runnable_ = as_runnable(cast(Callable, _model))
            return lambda: runnable_
        if not isinstance(_model, Runnable):
            # A constructor for a plain function; wrap the constructor itself.
            return lambda: RunnableLambda(constructor)
        # Typical correct case: a zero-argument chain constructor.
        return constructor
    return llm_or_chain_factory


def _get_prompt(inputs: Dict[str, Any]) -> str:
    """Get prompt from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A string prompt.
    Raises:
        InputFormatError: If the input format is invalid.
    """
    if not inputs:
        raise InputFormatError("Inputs should not be empty.")
    prompts = []
    if "prompt" in inputs:
        if not isinstance(inputs["prompt"], str):
            raise InputFormatError(
                f"Expected string for 'prompt', got {type(inputs['prompt']).__name__}"
            )
        prompts = [inputs["prompt"]]
    elif "prompts" in inputs:
        if not isinstance(inputs["prompts"], list) or not all(
            isinstance(i, str) for i in inputs["prompts"]
        ):
            raise InputFormatError(
                "Expected list of strings for 'prompts',"
                f" got {type(inputs['prompts']).__name__}"
            )
        prompts = inputs["prompts"]
    elif len(inputs) == 1:
        prompt_ = next(iter(inputs.values()))
        if isinstance(prompt_, str):
            prompts = [prompt_]
        elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):
            prompts = prompt_
        else:
            raise InputFormatError(
                f"LLM Run expects string prompt input. Got {inputs}"
            )
    else:
        raise InputFormatError(
            f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"
        )
    if len(prompts) == 1:
        return prompts[0]
    raise InputFormatError(
        f"LLM Run expects single prompt input. Got {len(prompts)} prompts."
    )


class ChatModelInput(TypedDict):
    """Input for a chat model.

    Parameters:
        messages: List of chat messages.
    """

    messages: List[BaseMessage]


def _get_messages(inputs: Dict[str, Any]) -> dict:
    """Get Chat Messages from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A list of chat messages.
    Raises:
        InputFormatError: If the input format is invalid.
    """
    if not inputs:
        raise InputFormatError("Inputs should not be empty.")
    input_copy = inputs.copy()
    if "messages" in inputs:
        input_copy["input"] = input_copy.pop("messages")
    elif len(inputs) == 1:
        input_copy["input"] = next(iter(inputs.values()))
    if "input" in input_copy:
        raw_messages = input_copy["input"]
        if isinstance(raw_messages, list) and all(
            isinstance(i, dict) for i in raw_messages
        ):
            raw_messages = [raw_messages]
        if len(raw_messages) == 1:
            input_copy["input"] = messages_from_dict(raw_messages[0])
        else:
            raise InputFormatError(
                "Batch messages not supported. Please provide a"
                " single list of messages."
            )
        return input_copy
    raise InputFormatError(
        "Chat Run expects single List[dict] or List[List[dict]] 'messages'"
        f" input. Got {inputs}"
    )


def _validate_example_inputs_for_language_model(
    first_example: Example,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    if input_mapper:
        prompt_input = input_mapper(first_example.inputs)
        if not isinstance(prompt_input, str) and not (
            isinstance(prompt_input, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_input)
        ):
            raise InputFormatError(
                "When using an input_mapper to prepare dataset example inputs"
                " for an LLM or chat model, the output must be a single string or"
                " a list of chat messages."
                f"\nGot: {prompt_input} of type {type(prompt_input)}."
            )
    else:
        try:
            _get_prompt(first_example.inputs)
        except InputFormatError:
            try:
                _get_messages(first_example.inputs)
            except InputFormatError:
                raise InputFormatError(
                    "Example inputs do not match language model input format."
                    " Expected a dictionary with messages or a single prompt."
                    f" Got: {first_example.inputs}"
                    " Please update your dataset OR provide an input_mapper"
                    " to convert the example.inputs to a compatible format"
                    " for the llm or chat model you wish to evaluate."
                )


def _validate_example_inputs_for_chain(
    first_example: Example,
    chain: Chain,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    """Validate that the example inputs match the chain input keys."""
    if input_mapper:
        first_inputs = input_mapper(first_example.inputs)
        missing_keys = set(chain.input_keys).difference(first_inputs)
        if not isinstance(first_inputs, dict):
            raise InputFormatError(
                "When using an input_mapper to prepare dataset example"
                " inputs for a chain, the mapped value must be a dictionary."
                f"\nGot: {first_inputs} of type {type(first_inputs)}."
            )
        if missing_keys:
            raise InputFormatError(
                "Missing keys after loading example using input_mapper."
                f"\nExpected: {chain.input_keys}. Got: {first_inputs.keys()}"
            )
    else:
        first_inputs = first_example.inputs
        missing_keys = set(chain.input_keys).difference(first_inputs)
        if len(first_inputs) == 1 and len(chain.input_keys) == 1:
            # A single-key input can be passed through positionally, so
            # refrain from strict validation here.
            pass
        elif missing_keys:
            raise InputFormatError(
                "Example inputs missing expected chain input keys."
                " Please provide an input_mapper to convert the example.inputs"
                " to a compatible format for the chain you wish to evaluate."
                f"Expected: {chain.input_keys}. "
                f"Got: {first_inputs.keys()}"
            )


def _validate_example_inputs(
    example: Example,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    """Validate that the example inputs are valid for the model."""
    if isinstance(llm_or_chain_factory, BaseLanguageModel):
        _validate_example_inputs_for_language_model(example, input_mapper)
    else:
        chain = llm_or_chain_factory()
        if isinstance(chain, Chain):
            # Otherwise it's a runnable
            _validate_example_inputs_for_chain(example, chain, input_mapper)
        elif isinstance(chain, Runnable):
            logger.debug(f"Skipping input validation for {chain}")


def _setup_evaluation(
    llm_or_chain_factory: MCF,
    examples: List[Example],
    evaluation: Optional[smith_eval.RunEvalConfig],
    data_type: DataType,
) -> Optional[List[RunEvaluator]]:
    """Configure the evaluators to run on the results of the chain."""
    if evaluation:
        if isinstance(llm_or_chain_factory, BaseLanguageModel):
            run_inputs, run_outputs = None, None
            run_type = "llm"
        else:
            run_type = "chain"
            chain = llm_or_chain_factory()
            run_inputs = chain.input_keys if isinstance(chain, Chain) else None
            run_outputs = chain.output_keys if isinstance(chain, Chain) else None
        run_evaluators = _load_run_evaluators(
            evaluation,
            run_type,
            data_type,
            list(examples[0].outputs) if examples[0].outputs else None,
            run_inputs,
            run_outputs,
        )
    else:
        run_evaluators = None
    return run_evaluators


def _determine_input_key(
    config: smith_eval.RunEvalConfig,
    run_inputs: Optional[List[str]],
) -> Optional[str]:
    input_key = None
    if config.input_key:
        input_key = config.input_key
        if run_inputs and input_key not in run_inputs:
            logger.warning(
                f"Input key {input_key} not in chain's specified"
                f" input keys {run_inputs}. Evaluation behavior may be undefined."
            )
    elif run_inputs and len(run_inputs) == 1:
        input_key = run_inputs[0]
    elif run_inputs is not None and len(run_inputs) > 1:
        logger.warning(
            f"Chain expects multiple input keys: {run_inputs},"
            " Evaluator is likely to fail. Evaluation behavior may be undefined."
            " Specify an input_key in the RunEvalConfig to avoid this warning."
        )
    return input_key


def _determine_prediction_key(
    config: smith_eval.RunEvalConfig,
    run_outputs: Optional[List[str]],
) -> Optional[str]:
    prediction_key = None
    if config.prediction_key:
        prediction_key = config.prediction_key
        if run_outputs and prediction_key not in run_outputs:
            logger.warning(
                f"Prediction key {prediction_key} not in chain's specified"
                f" output keys {run_outputs}. Evaluation behavior may be undefined."
            )
    elif run_outputs and len(run_outputs) == 1:
        prediction_key = run_outputs[0]
    elif run_outputs is not None and len(run_outputs) > 1:
        logger.warning(
            f"Chain expects multiple output keys: {run_outputs},"
            " Evaluation behavior may be undefined. Specify a prediction_key"
            " in the RunEvalConfig to avoid this warning."
        )
    return prediction_key


def _determine_reference_key(
    config: smith_eval.RunEvalConfig,
    example_outputs: Optional[List[str]],
) -> Optional[str]:
    if config.reference_key:
        reference_key = config.reference_key
        if example_outputs and reference_key not in example_outputs:
            raise ValueError(
                f"Reference key {reference_key} not in Dataset"
                f" example outputs: {example_outputs}"
            )
    elif example_outputs and len(example_outputs) == 1:
        reference_key = list(example_outputs)[0]
    else:
        reference_key = None
    return reference_key


def _construct_run_evaluator(
    eval_config: Union[
        smith_eval_config.SINGLE_EVAL_CONFIG_TYPE,
        smith_eval_config.CUSTOM_EVALUATOR_TYPE,
    ],
    eval_llm: Optional[BaseLanguageModel],
    run_type: str,
    data_type: DataType,
    example_outputs: Optional[List[str]],
    reference_key: Optional[str],
    input_key: Optional[str],
    prediction_key: Optional[str],
) -> RunEvaluator:
    if isinstance(eval_config, RunEvaluator):
        return eval_config
    if isinstance(eval_config, (EvaluatorType, str)):
        if not isinstance(eval_config, EvaluatorType):
            eval_config = EvaluatorType(eval_config)
        evaluator_ = load_evaluator(eval_config, llm=eval_llm)
        eval_type_tag = eval_config.value
    elif isinstance(eval_config, smith_eval_config.EvalConfig):
        kwargs = {"llm": eval_llm, **eval_config.get_kwargs()}
        evaluator_ = load_evaluator(eval_config.evaluator_type, **kwargs)
        eval_type_tag = eval_config.evaluator_type.value
        # Override keys if specified in the config
        if isinstance(eval_config, smith_eval_config.SingleKeyEvalConfig):
            input_key = eval_config.input_key or input_key
            prediction_key = eval_config.prediction_key or prediction_key
            reference_key = eval_config.reference_key or reference_key
    elif callable(eval_config):
        # Assume it can be decorated as a run evaluator.
        return run_evaluator_dec(eval_config)
    else:
        raise ValueError(f"Unknown evaluator type: {type(eval_config)}")

    if isinstance(evaluator_, StringEvaluator):
        if evaluator_.requires_reference and reference_key is None:
            raise ValueError(
                "Must specify reference_key in smith_eval.RunEvalConfig to use"
                f" evaluator of type {eval_type_tag} with"
                f" dataset with multiple output keys: {example_outputs}."
            )
        run_evaluator = smith_eval.StringRunEvaluatorChain.from_run_and_data_type(
            evaluator_,
            run_type,
            data_type,
            input_key=input_key,
            prediction_key=prediction_key,
            reference_key=reference_key,
            tags=[eval_type_tag],
        )
    elif isinstance(evaluator_, PairwiseStringEvaluator):
        raise NotImplementedError(
            f"Run evaluator for {eval_type_tag} is not implemented."
            " PairwiseStringEvaluators compare the outputs of two different"
            " models rather than the output of a single model."
            " Did you mean to use a StringEvaluator instead?"
            "\nSee: https://python.langchain.com/docs/guides/evaluation/string/"
        )
    else:
        raise NotImplementedError(
            f"Run evaluator for {eval_type_tag} is not implemented"
        )
    return run_evaluator


def _get_keys(
    config: smith_eval.RunEvalConfig,
    run_inputs: Optional[List[str]],
    run_outputs: Optional[List[str]],
    example_outputs: Optional[List[str]],
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    input_key = _determine_input_key(config, run_inputs)
    prediction_key = _determine_prediction_key(config, run_outputs)
    reference_key = _determine_reference_key(config, example_outputs)
    return input_key, prediction_key, reference_key


def _load_run_evaluators(
    config: smith_eval.RunEvalConfig,
    run_type: str,
    data_type: DataType,
    example_outputs: Optional[List[str]],
    run_inputs: Optional[List[str]],
    run_outputs: Optional[List[str]],
) -> List[RunEvaluator]:
    """
    Load run evaluators from a configuration.

    Args:
        config: Configuration for the run evaluators.

    Returns:
        A list of run evaluators.
    """
    run_evaluators = []
    input_key, prediction_key, reference_key = None, None, None
    if config.evaluators or any(
        [isinstance(e, StringEvaluator) for e in config.custom_evaluators or []]
    ):
        input_key, prediction_key, reference_key = _get_keys(
            config, run_inputs, run_outputs, example_outputs
        )
    for eval_config in config.evaluators:
        run_evaluator = _construct_run_evaluator(
            eval_config,
            config.eval_llm,
            run_type,
            data_type,
            example_outputs,
            reference_key,
            input_key,
            prediction_key,
        )
        run_evaluators.append(run_evaluator)
    custom_evaluators = config.custom_evaluators or []
    for custom_evaluator in custom_evaluators:
        if isinstance(custom_evaluator, RunEvaluator):
            run_evaluators.append(custom_evaluator)
        elif isinstance(custom_evaluator, StringEvaluator):
            run_evaluators.append(
                smith_eval.StringRunEvaluatorChain.from_run_and_data_type(
                    custom_evaluator,
                    run_type,
                    data_type,
                    input_key=input_key,
                    prediction_key=prediction_key,
                    reference_key=reference_key,
                )
            )
        elif callable(custom_evaluator):
            run_evaluators.append(run_evaluator_dec(custom_evaluator))
        else:
            raise ValueError(
                f"Unsupported custom evaluator: {custom_evaluator}."
                " Expected RunEvaluator or StringEvaluator."
            )

    return run_evaluators


async def _arun_llm(
    llm: BaseLanguageModel,
    inputs: Dict[str, Any],
    *,
    tags: Optional[List[str]] = None,
    callbacks: Callbacks = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[str, BaseMessage]:
    """Asynchronously run the language model.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map inputs to the expected format.

    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    """
    if input_mapper is not None:
        prompt_or_messages = input_mapper(inputs)
        if isinstance(prompt_or_messages, str) or (
            isinstance(prompt_or_messages, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
        ):
            return await llm.ainvoke(
                prompt_or_messages,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        raise InputFormatError(
            "Input mapper returned invalid format"
            f" {prompt_or_messages}"
            "\nExpected a single string or list of chat messages."
        )
    try:
        prompt = _get_prompt(inputs)
        llm_output: Union[str, BaseMessage] = await llm.ainvoke(
            prompt,
            config=RunnableConfig(
                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
            ),
        )
    except InputFormatError:
        llm_inputs = _get_messages(inputs)
        llm_output = await llm.ainvoke(
            **llm_inputs,
            config=RunnableConfig(
                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
            ),
        )
    return llm_output


async def _arun_chain(
    chain: Union[Chain, Runnable],
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[dict, str]:
    """Run a chain asynchronously on inputs."""
    inputs_ = inputs if input_mapper is None else input_mapper(inputs)
    if (
        isinstance(chain, Chain)
        and isinstance(inputs_, dict)
        and len(inputs_) == 1
        and chain.input_keys
    ):
        val = next(iter(inputs_.values()))
        output = await chain.ainvoke(
            val,
            config=RunnableConfig(
                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
            ),
        )
    else:
        runnable_config = RunnableConfig(
            tags=tags or [], callbacks=callbacks, metadata=metadata or {}
        )
        output = await chain.ainvoke(inputs_, config=runnable_config)
    return output


async def _arun_llm_or_chain(
    example: Example,
    config: RunnableConfig,
    *,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[dict, str, LLMResult, ChatResult]:
    """Asynchronously run the Chain or language model.

    Args:
        example: The example to run.
        config: The RunnableConfig with the tags, callbacks, and metadata
            to attach to the run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        input_mapper: Optional function to map the input to the expected format.

    Returns:
        The output of the model or chain, or an ``EvalError`` if the run failed.
    """
    chain_or_llm = (
        "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
    )
    result = None
    try:
        if isinstance(llm_or_chain_factory, BaseLanguageModel):
            output: Any = await _arun_llm(
                llm_or_chain_factory,
                example.inputs,
                tags=config["tags"],
                callbacks=config["callbacks"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        else:
            chain = llm_or_chain_factory()
            output = await _arun_chain(
                chain,
                example.inputs,
                tags=config["tags"],
                callbacks=config["callbacks"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        result = output
    except Exception as e:
        logger.warning(
            f"{chain_or_llm} failed for example {example.id}"
            f" with inputs {example.inputs}"
            f"\n{repr(e)}"
        )
        result = EvalError(Error=e)
    return result


def _run_llm(
    llm: BaseLanguageModel,
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[str, BaseMessage]:
    """
    Run the language model on the example.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        callbacks: The callbacks to use during the run.
        tags: Optional tags to add to the run.
        input_mapper: function to map to the inputs dictionary from an Example
    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    """
    # The body mirrors _arun_llm, but synchronously.
    if input_mapper is not None:
        prompt_or_messages = input_mapper(inputs)
        if isinstance(prompt_or_messages, str) or (
            isinstance(prompt_or_messages, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
        ):
            llm_output: Union[str, BaseMessage] = llm.invoke(
                prompt_or_messages,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        else:
            raise InputFormatError(
                "Input mapper returned invalid format: "
                f" {prompt_or_messages}"
                "\nExpected a single string or list of chat messages."
            )
    else:
        try:
            llm_prompts = _get_prompt(inputs)
            llm_output = llm.invoke(
                llm_prompts,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        except InputFormatError:
            llm_inputs = _get_messages(inputs)
            llm_output = llm.invoke(
                **llm_inputs,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
    return llm_output


def _run_chain(
    chain: Union[Chain, Runnable],
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[Dict, str]:
    """Run a chain on inputs."""
    inputs_ = inputs if input_mapper is None else input_mapper(inputs)
    if (
        isinstance(chain, Chain)
        and isinstance(inputs_, dict)
        and len(inputs_) == 1
        and chain.input_keys
    ):
        val = next(iter(inputs_.values()))
        output = chain.invoke(
            val,
            config=RunnableConfig(
                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
            ),
        )
    else:
        runnable_config = RunnableConfig(
            tags=tags or [], callbacks=callbacks, metadata=metadata or {}
        )
        output = chain.invoke(inputs_, config=runnable_config)
    return output


def _run_llm_or_chain(
    example: Example,
    config: RunnableConfig,
    *,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[dict, str, LLMResult, ChatResult]:
    """
    Run the Chain or language model synchronously.

    Args:
        example: The example to run.
        config: The RunnableConfig with the tags, callbacks, and metadata
            to attach to the run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        input_mapper: Optional function to map the input to the expected format.

    Returns:
        Union[dict, str, LLMResult, ChatResult]:
          The output of the model or chain, or an ``EvalError`` if the run failed.
    """
    chain_or_llm = (
        "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
    )
    result = None
    try:
        if isinstance(llm_or_chain_factory, BaseLanguageModel):
            output: Any = _run_llm(
                llm_or_chain_factory,
                example.inputs,
                config["callbacks"],
                tags=config["tags"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        else:
            chain = llm_or_chain_factory()
            output = _run_chain(
                chain,
                example.inputs,
                config["callbacks"],
                tags=config["tags"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        result = output
    except Exception as e:
        error_type = type(e).__name__
        logger.warning(
            f"{chain_or_llm} failed for example {example.id}"
            f" with inputs {example.inputs}"
            f"\nError Type: {error_type}, Message: {e}"
        )
        result = EvalError(Error=e)
    return result


def _prepare_eval_run(
    client: Client,
    dataset_name: str,
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    project_name: str,
    project_metadata: Optional[Dict[str, Any]] = None,
    tags: Optional[List[str]] = None,
    dataset_version: Optional[Union[str, datetime]] = None,
) -> Tuple[MCF, TracerSession, Dataset, List[Example]]:
    wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
    dataset = client.read_dataset(dataset_name=dataset_name)

    examples = list(
        client.list_examples(dataset_id=dataset.id, as_of=dataset_version)
    )
    if not examples:
        raise ValueError(f"Dataset {dataset_name} has no example rows.")
    modified_at = [ex.modified_at for ex in examples if ex.modified_at]
    max_modified_at = max(modified_at) if modified_at else None
    inferred_version = max_modified_at.isoformat() if max_modified_at else None

    try:
        project_metadata = project_metadata or {}
        git_info = get_git_info()
        if git_info:
            project_metadata = {**project_metadata, "git": git_info}
        project_metadata["dataset_version"] = inferred_version
        project = client.create_project(
            project_name,
            reference_dataset_id=dataset.id,
            project_extra={"tags": tags} if tags else {},
            metadata=project_metadata,
        )
    except (HTTPError, ValueError, LangSmithError) as e:
        if "already exists " not in str(e):
            raise e
        uid = uuid.uuid4()
        example_msg = f"""
run_on_dataset(
    ...
    project_name="{project_name} - {uid}", # Update since {project_name} already exists
)
"""
        raise ValueError(
            f"Test project {project_name} already exists. Please use a different"
            f" name:\n\n{example_msg}"
        )
    comparison_url = dataset.url + f"/compare?selectedSessions={project.id}"
    print(
        f"View the evaluation results for project '{project_name}'"
        f" at:\n{comparison_url}\n\n"
        f"View all tests for Dataset {dataset_name} at:\n{dataset.url}",
        flush=True,
    )
    return wrapped_model, project, dataset, examples


class _RowResult(TypedDict, total=False):
    """A dictionary of the results for a single example row."""

    feedback: Optional[List[EvaluationResult]]
    execution_time: Optional[float]
    run_id: Optional[str]


@dataclasses.dataclass
class _DatasetRunContainer:
    """A container to help manage the state of an eval run."""

    client: Client
    project: TracerSession
    wrapped_model: MCF
    examples: List[Example]
    configs: List[RunnableConfig]
    batch_evaluators: Optional[List[smith_eval_config.BATCH_EVALUATOR_LIKE]] = None

    def _merge_test_outputs(
        self,
        batch_results: list,
        all_eval_results: Dict[str, _RowResult],
    ) -> dict:
        results: dict = {}
        for example, output in zip(self.examples, batch_results):
            row_result = cast(_RowResult, all_eval_results.get(str(example.id), {}))
            results[str(example.id)] = {
                "input": example.inputs,
                "feedback": row_result.get("feedback", []),
                "execution_time": row_result.get("execution_time"),
                "run_id": row_result.get("run_id"),
            }
            if isinstance(output, EvalError):
                results[str(example.id)]["Error"] = output.Error
            else:
                results[str(example.id)]["output"] = output
            if example.outputs:
                results[str(example.id)]["reference"] = example.outputs
        return results

    def _run_batch_evaluators(self, runs: Dict[str, Run]) -> List[dict]:
        evaluators = self.batch_evaluators
        if not evaluators:
            return []
        runs_list = [runs[str(example.id)] for example in self.examples]
        aggregate_feedback = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for evaluator in evaluators:
                try:
                    result = evaluator(runs_list, self.examples)
                    if isinstance(result, EvaluationResult):
                        result = result.dict()
                    aggregate_feedback.append(cast(dict, result))
                    executor.submit(
                        self.client.create_feedback,
                        **result,
                        run_id=None,
                        project_id=self.project.id,
                    )
                except Exception as e:
                    logger.error(
                        f"Error running batch evaluator {repr(evaluator)}: {e}"
                    )
        return aggregate_feedback

    def _collect_metrics(self) -> Tuple[Dict[str, _RowResult], Dict[str, Run]]:
        all_eval_results: dict = {}
        all_runs: dict = {}
        for c in self.configs:
            for callback in cast(list, c["callbacks"]):
                if isinstance(callback, EvaluatorCallbackHandler):
                    eval_results = callback.logged_eval_results
                    for (_, example_id), v in eval_results.items():
                        all_eval_results.setdefault(str(example_id), {}).update(
                            {"feedback": v}
                        )
                elif isinstance(callback, LangChainTracer):
                    run = callback.latest_run
                    execution_time = (
                        (run.end_time - run.start_time).total_seconds()
                        if run and run.end_time
                        else None
                    )
                    run_id = str(run.id) if run else None
                    all_eval_results.setdefault(str(callback.example_id), {}).update(
                        {
                            "execution_time": execution_time,
                            "run_id": run_id,
                            "run": run,
                        }
                    )
                    all_runs[str(callback.example_id)] = run
        return cast(Dict[str, _RowResult], all_eval_results), all_runs

    def _collect_test_results(
        self,
        batch_results: List[Union[dict, str, LLMResult, ChatResult]],
    ) -> TestResult:
        logger.info("Waiting for evaluators to complete.")
        wait_for_all_evaluators()
        all_eval_results, all_runs = self._collect_metrics()
        aggregate_feedback = None
        if self.batch_evaluators:
            logger.info("Running session evaluators.")
            aggregate_feedback = self._run_batch_evaluators(all_runs)
        results = self._merge_test_outputs(batch_results, all_eval_results)
        return TestResult(
            project_name=self.project.name,
            results=results,
            aggregate_metrics=aggregate_feedback,
        )

    def finish(self, batch_results: list, verbose: bool = False) -> TestResult:
        results = self._collect_test_results(batch_results)
        if verbose:
            try:
                agg_feedback = results.get_aggregate_feedback()
                _display_aggregate_results(agg_feedback)
            except Exception as e:
                logger.error(f"Failed to print aggregate feedback: {repr(e)}")
        try:
            # Closing the project marks the test run as complete.
            self.client.update_project(
                self.project.id, end_time=datetime.now(timezone.utc)
            )
        except Exception as e:
            logger.error(f"Failed to close project: {repr(e)}")
        return results

    @classmethod
    def prepare(
        cls,
        client: Client,
        dataset_name: str,
        llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
        project_name: Optional[str],
        evaluation: Optional[smith_eval.RunEvalConfig] = None,
        tags: Optional[List[str]] = None,
        input_mapper: Optional[Callable[[Dict], Any]] = None,
        concurrency_level: int = 5,
        project_metadata: Optional[Dict[str, Any]] = None,
        revision_id: Optional[str] = None,
        dataset_version: Optional[Union[datetime, str]] = None,
    ) -> _DatasetRunContainer:
        project_name = project_name or name_generation.random_name()
        if revision_id:
            if not project_metadata:
                project_metadata = {}
            project_metadata.update({"revision_id": revision_id})
        wrapped_model, project, dataset, examples = _prepare_eval_run(
            client,
            dataset_name,
            llm_or_chain_factory,
            project_name,
            project_metadata=project_metadata,
            tags=tags,
            dataset_version=dataset_version,
        )
        tags = tags or []
        for k, v in (project.metadata.get("git") or {}).items():
            tags.append(f"git:{k}={v}")
        run_metadata: Dict[str, Any] = {
            "dataset_version": project.metadata.get("dataset_version")
        }
        if revision_id:
            run_metadata["revision_id"] = revision_id
        run_evaluators = _setup_evaluation(
            wrapped_model, examples, evaluation, dataset.data_type or DataType.kv
        )
        _validate_example_inputs(examples[0], wrapped_model, input_mapper)
        progress_bar = progress.ProgressBarCallback(len(examples))
        configs = [
            RunnableConfig(
                callbacks=[
                    LangChainTracer(
                        project_name=project.name,
                        client=client,
                        example_id=example.id,
                    ),
                    EvaluatorCallbackHandler(
                        evaluators=run_evaluators or [],
                        client=client,
                        example_id=example.id,
                        max_concurrency=0,
                    ),
                    progress_bar,
                ],
                tags=tags,
                max_concurrency=concurrency_level,
                metadata=run_metadata,
            )
            for example in examples
        ]
        return cls(
            client=client,
            project=project,
            wrapped_model=wrapped_model,
            examples=examples,
            configs=configs,
            batch_evaluators=evaluation.batch_evaluators if evaluation else None,
        )


def _is_jupyter_environment() -> bool:
    try:
        from IPython import get_ipython

        res = get_ipython()
        return get_ipython() is not None and "zmqshell" in str(type(res))
    except ImportError:
        return False


def _display_aggregate_results(aggregate_results: pd.DataFrame) -> None:
    if _is_jupyter_environment():
        from IPython.display import HTML, display

        display(HTML("<h3>Experiment Results:</h3>"))
        display(aggregate_results)
    else:
        formatted_string = aggregate_results.to_string(
            float_format=lambda x: f"{x:.2f}", justify="right"
        )
        print("\n Experiment Results:")
        print(formatted_string)


_INPUT_MAPPER_DEP_WARNING = (
    "The input_mapper argument is deprecated and will be removed in a future"
    " release. Please add a RunnableLambda to your chain to map inputs to the"
    " expected format instead. Example:\n"
    "def construct_chain():\n"
    "    my_chain = ...\n"
    "    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}\n"
    "    return input_mapper | my_chain\n"
    "run_on_dataset(..., llm_or_chain_factory=construct_chain)\n"
    "(See https://api.python.langchain.com/en/latest/schema/"
    "langchain.schema.runnable.base.RunnableLambda.html)"
)


async def arun_on_dataset(
    client: Optional[Client],
    dataset_name: str,
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    *,
    evaluation: Optional[smith_eval.RunEvalConfig] = None,
    dataset_version: Optional[Union[datetime, str]] = None,
    concurrency_level: int = 5,
    project_name: Optional[str] = None,
    project_metadata: Optional[Dict[str, Any]] = None,
    verbose: bool = False,
    revision_id: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    input_mapper = kwargs.pop("input_mapper", None)
    if input_mapper:
        warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)
    if revision_id is None:
        revision_id = get_langchain_env_var_metadata().get("revision_id")
    tags = kwargs.pop("tags", None)
    if tags:
        warn_deprecated(
            "0.1.9",
            message="The tags argument is deprecated and will be"
            " removed in a future release. Please specify project_metadata instead.",
            pending=True,
        )
    if kwargs:
        warn_deprecated(
            "0.0.305",
            message="The following arguments are deprecated and "
            "will be removed in a future release: "
            f"{kwargs.keys()}.",
            removal="0.0.305",
        )
    client = client or Client()
    container = _DatasetRunContainer.prepare(
        client,
        dataset_name,
        llm_or_chain_factory,
        project_name,
        evaluation,
        tags,
        input_mapper,
        concurrency_level,
        project_metadata=project_metadata,
        revision_id=revision_id,
        dataset_version=dataset_version,
    )
    batch_results = await runnable_utils.gather_with_concurrency(
        container.configs[0].get("max_concurrency"),
        *map(
            functools.partial(
                _arun_llm_or_chain,
                llm_or_chain_factory=container.wrapped_model,
                input_mapper=input_mapper,
            ),
            container.examples,
            container.configs,
        ),
    )
    return container.finish(batch_results, verbose=verbose)


def run_on_dataset(
    client: Optional[Client],
    dataset_name: str,
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    *,
    evaluation: Optional[smith_eval.RunEvalConfig] = None,
    dataset_version: Optional[Union[datetime, str]] = None,
    concurrency_level: int = 5,
    project_name: Optional[str] = None,
    project_metadata: Optional[Dict[str, Any]] = None,
    verbose: bool = False,
    revision_id: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    input_mapper = kwargs.pop("input_mapper", None)
    if input_mapper:
        warn_deprecated("0.0.305", message=_INPUT_MAPPER_DEP_WARNING, pending=True)
    if revision_id is None:
        revision_id = get_langchain_env_var_metadata().get("revision_id")
    tags = kwargs.pop("tags", None)
    if tags:
        warn_deprecated(
            "0.1.9",
            message="The tags argument is deprecated and will be"
            " removed in a future release. Please specify project_metadata instead.",
            pending=True,
        )
    if kwargs:
        warn_deprecated(
            "0.0.305",
            message="The following arguments are deprecated and "
            "will be removed in a future release: "
            f"{kwargs.keys()}.",
            removal="0.0.305",
        )
    client = client or Client()
    container = _DatasetRunContainer.prepare(
        client,
        dataset_name,
        llm_or_chain_factory,
        project_name,
        evaluation,
        tags,
        input_mapper,
        concurrency_level,
        project_metadata=project_metadata,
        revision_id=revision_id,
        dataset_version=dataset_version,
    )
    if concurrency_level == 0:
        batch_results = [
            _run_llm_or_chain(
                example,
                config,
                llm_or_chain_factory=container.wrapped_model,
                input_mapper=input_mapper,
            )
            for example, config in zip(container.examples, container.configs)
        ]
    else:
        with runnable_config.get_executor_for_config(
            container.configs[0]
        ) as executor:
            batch_results = list(
                executor.map(
                    functools.partial(
                        _run_llm_or_chain,
                        llm_or_chain_factory=container.wrapped_model,
                        input_mapper=input_mapper,
                    ),
                    container.examples,
                    container.configs,
                )
            )
    return container.finish(batch_results, verbose=verbose)


_RUN_ON_DATASET_DOCSTRING = """
Run the Chain or language model on a dataset and store traces
to the specified project name.

Args:
    dataset_name: Name of the dataset to run the chain on.
    llm_or_chain_factory: Language model or Chain constructor to run
        over the dataset. The Chain constructor is used to permit
        independent calls on each example without carrying over state.
    evaluation: Configuration for evaluators to run on the
        results of the chain
    concurrency_level: The number of async tasks to run concurrently.
    project_name: Name of the project to store the traces in.
        Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information the test variant.
        (prompt version, model version, etc.)
    client: LangSmith client to use to access the dataset and to
        log feedback and run traces.
    verbose: Whether to print progress.
    tags: Tags to add to each run in the project.
    revision_id: Optional revision identifier to assign this test run to
        track the performance of different versions of your system.
Returns:
    A dictionary containing the run's project name and the resulting model outputs.


For the (usually faster) async version of this function, see :func:`arun_on_dataset`.

Examples
--------

.. code-block:: python

    from langsmith import Client
    from langchain_openai import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            RunEvalConfig.Criteria("helpfulness"),
            RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
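
The call returns a :class:`TestResult` dictionary. With pandas installed, the
per-example rows can be flattened for further analysis (feedback column names
depend on the evaluators configured):

.. code-block:: python

    results = run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
    df = results.to_dataframe()  # one row per dataset example
    print(df[[c for c in df.columns if c.startswith("feedback.")]].mean())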

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = RunEvalConfig(
        custom_evaluators = [MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
zrun_on_dataset(zawait arun_on_dataset()r}   )NNN)r:   
__future__r   concurrent.futuresr  dataclassesr\  r   loggingr
  r   r   typingr   r   r   r   r	   r
   r   r   r   Zlangchain_core._apir   Z langchain_core.callbacks.managerr   Zlangchain_core.language_modelsr   Zlangchain_core.messagesr   r   Zlangchain_core.outputsr   r   Zlangchain_core.runnablesr   r   r   r   r   r   rZ  Z!langchain_core.tracers.evaluationr   r   Z langchain_core.tracers.langchainr   Zlangsmith.clientr   Zlangsmith.envr   r    Zlangsmith.evaluationr!   r"   r#   r   Zlangsmith.run_helpersr$   r%   Zlangsmith.schemasr&   r'   r(   r)   r*   Zlangsmith.utilsr+   requestsr,   Ztyping_extensionsr-   Zlangchain.chains.baser.   Zlangchain.evaluation.loadingr/   Zlangchain.evaluation.schemar0   r1   r2   Zlangchain.smithr3   r   Zlangchain.smith.evaluationr   r4   r5   r_   rg   	getLoggerr7   r   rd   r~   r   r   r6   r=   rn   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  	dataclassr  rF  r3  rY  r_  ra  Z_RUN_ON_DATASET_DOCSTRINGreplacer;   r;   r;   r<   <module>   s   ,
I =2
(!%CGE%>C%=   = C
(F(Mk
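# A minimal sketch of driving the async variant (assumes a populated dataset
# named "my_dataset" and a `construct_chain` factory as in the docstring above):
#
#     import asyncio
#     from langsmith import Client
#
#     results = asyncio.run(
#         arun_on_dataset(Client(), "my_dataset", construct_chain)
#     )
#     print(results["project_name"])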