a
    dg
C                     @  sD  d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
mZ d dlZd dlmZmZmZ d dlmZmZmZ d dlmZ d dlmZ erd d	lmZmZ d d
lmZ d dlmZ d dl m!Z!m"Z" ddddZ#ddddZ$ddddZ%ddddZ&ddddgdZ'G dd  d eZ(G d!d" d"e(Z)G d#d$ d$e(eZ*dS )%    )annotationsN)Path)TYPE_CHECKINGCallableDictListOptionalUnion)&DEFAULT_DEANONYMIZER_MATCHING_STRATEGYAnonymizerBaseReversibleAnonymizerBase)DeanonymizerMappingMappingDataTypecreate_anonymizer_mapping)exact_matching_strategy)get_pseudoanonymizer_mapping)AnalyzerEngineEntityRecognizerNlpEngineProviderAnonymizerEngine)ConflictResolutionStrategyOperatorConfigz'AnalyzerEngine'returnc               
   C  sD   zddl m}  W n. ty> } ztd|W Y d }~n
d }~0 0 | S )Nr   )r   Could not import presidio_analyzer, please install with `pip install presidio-analyzer`. You will also need to download a spaCy model to use the analyzer, e.g. `python -m spacy download en_core_web_lg`.)presidio_analyzerr   ImportError)r   e r    }/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_experimental/data_anonymizer/presidio.py_import_analyzer_engine!   s    r"   z'NlpEngineProvider'c               
   C  sD   zddl m}  W n. ty> } ztd|W Y d }~n
d }~0 0 | S )Nr   r   r   )presidio_analyzer.nlp_enginer   r   )r   r   r    r    r!   _import_nlp_engine_provider/   s    r$   z'AnonymizerEngine'c               
   C  sD   zddl m}  W n. ty> } ztd|W Y d }~n
d }~0 0 | S )Nr   r   \Could not import presidio_anonymizer, please install with `pip install presidio-anonymizer`.)presidio_anonymizerr   r   )r   r   r    r    r!   _import_anonymizer_engine=   s    r'   z'OperatorConfig'c               
   C  sD   zddl m}  W n. ty> } ztd|W Y d }~n
d }~0 0 | S )Nr   r   r%   )presidio_anonymizer.entitiesr   r   )r   r   r    r    r!   _import_operator_configH   s    r*   ZspacyenZen_core_web_lg)Z	lang_codeZ
model_name)Znlp_engine_namemodelsc                   @  sH   e Zd ZdZddddddd	d
dZdddddZdddddZdS )PresidioAnonymizerBasezcBase Anonymizer using Microsoft Presidio.

    See more: https://microsoft.github.io/presidio/
    NTOptional[List[str]]#Optional[Dict[str, OperatorConfig]]Optional[Dict]boolOptional[int]analyzed_fields	operatorslanguages_configadd_default_faker_operators
faker_seedc                   s   |du rt }t  t }t }t }|dur0|ntt  | _|rb fddt|	 D | _
ni | _
|rv| | ||d}	|	 }
t|
j | _|| j|
d| _| | _dS )a  
        Args:
            analyzed_fields: List of fields to detect and then anonymize.
                Defaults to all entities supported by Microsoft Presidio.
            operators: Operators to use for anonymization.
                Operators allow for custom anonymization of detected PII.
                Learn more:
                https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
            languages_config: Configuration for the NLP engine.
                First language in the list will be used as the main language
                in self.anonymize(...) when no language is specified.
                Learn more:
                https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
            faker_seed: Seed used to initialize faker.
                Defaults to None, in which case faker will be seeded randomly
                and provide random values.
        Nc                   s"   i | ]\}}| d d|idqS )Zcustomlambda)Zoperator_nameparamsr    ).0fieldZfaker_functionr(   r    r!   
<dictcomp>   s   z3PresidioAnonymizerBase.__init__.<locals>.<dictcomp>)Znlp_configuration)supported_languages
nlp_engine)DEFAULT_LANGUAGES_CONFIGr*   r"   r$   r'   listr   keysr4   itemsr5   add_operatorsZcreate_engineZnlpr>   	_analyzer_anonymizer)selfr4   r5   r6   r7   r8   r   r   r   providerr?   r    r(   r!   __init__j   s4    

	

zPresidioAnonymizerBase.__init__r   None)
recognizerr   c                 C  s    | j j| | j|j dS )zsAdd a recognizer to the analyzer

        Args:
            recognizer: Recognizer to add to the analyzer.
        N)rE   registryadd_recognizerr4   extendsupported_entities)rG   rK   r    r    r!   rM      s    z%PresidioAnonymizerBase.add_recognizerzDict[str, OperatorConfig])r5   r   c                 C  s   | j | dS )zrAdd operators to the anonymizer

        Args:
            operators: Operators to add to the anonymizer.
        N)r5   update)rG   r5   r    r    r!   rD      s    z$PresidioAnonymizerBase.add_operators)NNNTN)__name__
__module____qualname____doc__rI   rM   rD   r    r    r    r!   r-   d   s        ?	r-   c                   @  s(   e Zd ZdZd
dddddddd	ZdS )PresidioAnonymizerz$Anonymizer using Microsoft Presidio.NstrOptional[str]r.   $Optional[ConflictResolutionStrategy]textlanguage
allow_listconflict_resolutionr   c                 C  s   |du r| j d }n"|| j vr6td| d| j  dg }| j|D ],}| }|d|v rh|d gn|d  qFtt|t| j	}| jj
||||d}	| j|	|}
| jj||	| jd	}t||
|}t||S )
a8  Anonymize text.
        Each PII entity is replaced with a fake value.
        Each time fake values will be different, as they are generated randomly.

        PresidioAnonymizer has no built-in memory -
        so it will not remember the effects of anonymizing previous texts.
        >>> anonymizer = PresidioAnonymizer()
        >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
        'My name is Noah Rhodes. Hi Noah Rhodes!'
        >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
        'My name is Brett Russell. Hi Brett Russell!'

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
                If None, the first (main) language in the list
                of languages specified in the configuration will be used.
        Nr   
Language '-' is not supported. Supported languages are: @. Change your language configuration file to add more languages.supported_entityrO   entitiesr[   r\   analyzer_resultsr5   )r>   
ValueErrorrE   get_recognizersto_dictrN   rA   setintersectionr4   analyzerF   0_remove_conflicts_and_get_text_manipulation_data	anonymizer5   r   r   )rG   rZ   r[   r\   r]   rO   rK   recognizer_dictentities_to_analyzere   filtered_analyzer_resultsanonymizer_resultsanonymizer_mappingr    r    r!   
_anonymize   sP    
	zPresidioAnonymizer._anonymize)NNN)rQ   rR   rS   rT   rs   r    r    r    r!   rU      s
      rU   c                      s   e Zd ZdZd%dddddd	 fd
dZeddddZeddddZd&ddddddddZe	fddddddZ
ddddZddd d!d"Zddd d#d$Z  ZS )'PresidioReversibleAnonymizerz/Reversible Anonymizer using Microsoft Presidio.NTr.   r/   r0   r1   r2   r3   c                   s,   |d u rt }t ||||| t | _d S )N)r@   superrI   r   _deanonymizer_mapping)rG   r4   r5   r6   r7   r8   	__class__r    r!   rI     s    z%PresidioReversibleAnonymizer.__init__r   r   c                 C  s   | j jS )zReturn the deanonymizer mapping)rv   datarG   r    r    r!   deanonymizer_mapping!  s    z1PresidioReversibleAnonymizer.deanonymizer_mappingc                 C  s   dd | j  D S )zcReturn the anonymizer mapping
        This is just the reverse version of the deanonymizer mapping.c                 S  s$   i | ]\}}|d d |  D qS )c                 S  s   i | ]\}}||qS r    r    )r;   kvr    r    r!   r=   +      zNPresidioReversibleAnonymizer.anonymizer_mapping.<locals>.<dictcomp>.<dictcomp>)rC   )r;   keyZ
inner_dictr    r    r!   r=   *  s   zCPresidioReversibleAnonymizer.anonymizer_mapping.<locals>.<dictcomp>)r{   rC   rz   r    r    r!   rr   &  s    z/PresidioReversibleAnonymizer.anonymizer_mappingrV   rW   rX   rY   c                 C  s   |du r| j d }|| j vr4td| d| j  dg }| j|D ],}| }|d|v rf|d gn|d  qDtt|t| j	}| jj
||||d}	| j|	|}
| jj||	| jd	}t||
|d
d}| j| t|| jS )a  Anonymize text.
        Each PII entity is replaced with a fake value.
        Each time fake values will be different, as they are generated randomly.
        At the same time, we will create a mapping from each anonymized entity
        back to its original text value.

        Thanks to the built-in memory, all previously anonymised entities
        will be remembered and replaced by the same fake values:
        >>> anonymizer = PresidioReversibleAnonymizer()
        >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
        'My name is Noah Rhodes. Hi Noah Rhodes!'
        >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!")
        'My name is Noah Rhodes. Hi Noah Rhodes!'

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
                If None, the first (main) language in the list
                of languages specified in the configuration will be used.
        Nr   r^   r_   r`   ra   rO   rb   rd   T)is_reversed)r>   rf   rE   rg   rh   rN   rA   ri   rj   r4   rk   rF   rl   rm   r5   r   rv   rP   r   rr   )rG   rZ   r[   r\   r]   rO   rK   rn   ro   re   rp   rq   Znew_deanonymizer_mappingr    r    r!   rs   /  sT    

	z'PresidioReversibleAnonymizer._anonymizez%Callable[[str, MappingDataType], str])text_to_deanonymizedeanonymizer_matching_strategyr   c                 C  s    | j stdd||| j}|S )a  Deanonymize text.
        Each anonymized entity is replaced with its original value.
        This method exploits the mapping created during the anonymization process.

        Args:
            text_to_deanonymize: text to deanonymize
            deanonymizer_matching_strategy: function to use to match
                anonymized entities with their original values and replace them.
        zDeanonymizer mapping is empty.z6Please call anonymize() and anonymize some text first.)rv   rf   r{   )rG   r   r   r    r    r!   _deanonymize  s    z)PresidioReversibleAnonymizer._deanonymizerJ   c                 C  s   t  | _dS )zReset the deanonymizer mappingN)r   rv   rz   r    r    r!   reset_deanonymizer_mapping  s    z7PresidioReversibleAnonymizer.reset_deanonymizer_mappingzUnion[Path, str])	file_pathr   c                 C  s   t |}|jdvr t| d|jjddd |jdkrxt|d"}tj| j|dd W d	   q1 sl0    Y  nH|j	d
rt|d"}t
j| j|dd W d	   n1 s0    Y  d	S )a  Save the deanonymizer mapping to a JSON or YAML file.

        Args:
            file_path: Path to file to save the mapping to.

        Example:
        .. code-block:: python

            anonymizer.save_deanonymizer_mapping(file_path="path/mapping.json")
        .json.yaml) must have an extension of .json or .yamlT)parentsexist_okr   w   )indentNr   z.ymlF)Zdefault_flow_style)r   suffixrf   parentmkdiropenjsondumpr{   endswithyaml)rG   r   Z	save_pathfr    r    r!   save_deanonymizer_mapping  s    

2z6PresidioReversibleAnonymizer.save_deanonymizer_mappingc                 C  s   t |}|jdvr t| d|jdkr`t|d}t|}W d   q1 sT0    Y  nF|jdrt|d }tj|tjd}W d   n1 s0    Y  | j	
| dS )a  Load the deanonymizer mapping from a JSON or YAML file.

        Args:
            file_path: Path to file to load the mapping from.

        Example:
        .. code-block:: python

            anonymizer.load_deanonymizer_mapping(file_path="path/mapping.json")
        r   r   r   rNr   )Loader)r   r   rf   r   r   loadr   r   Z
FullLoaderrv   rP   )rG   r   Z	load_pathr   Zloaded_mappingr    r    r!   load_deanonymizer_mapping  s    

*.z6PresidioReversibleAnonymizer.load_deanonymizer_mapping)NNNTN)NNN)rQ   rR   rS   rT   rI   propertyr{   rr   rs   r
   r   r   r   r   __classcell__r    r    rw   r!   rt     s(           Wrt   )+
__future__r   r   pathlibr   typingr   r   r   r   r   r	   r   Z+langchain_experimental.data_anonymizer.baser
   r   r   Z;langchain_experimental.data_anonymizer.deanonymizer_mappingr   r   r   ZGlangchain_experimental.data_anonymizer.deanonymizer_matching_strategiesr   Z=langchain_experimental.data_anonymizer.faker_presidio_mappingr   r   r   r   r#   r   r&   r   r)   r   r   r"   r$   r'   r*   r@   r-   rU   rt   r    r    r    r!   <module>   s0    WP