a
    bg?H                     @  s  d Z ddlmZ ddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ eeZdgZ eG dd dZ!ddhZ"h dZ#dddddZ$G dd deZ%G dd deZ&eG dd deZ'dS )zLoads YouTube transcript.    )annotationsN)Enum)Path)AnyDict	GeneratorListOptionalSequenceUnion)parse_qsurlparse)
ParseError)Document)model_validator)	dataclass)
BaseLoaderz0https://www.googleapis.com/auth/youtube.readonlyc                   @  s   e Zd ZU dZe d d Zded< e d d Zded< e d d Z	ded< d	d
ddZ
eddedddddZdd
ddZdS )GoogleApiClienta  Generic Google API Client.

    To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
    python package installed.
    As the google api expects credentials you need to set up a google account and
    register your Service. "https://developers.google.com/docs/api/quickstart/python"

    *Security Note*: Note that parsing of the transcripts relies on the standard
        xml library but the input is viewed as trusted in this case.


    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )

    z.credentialszcredentials.jsonr   credentials_pathservice_account_pathz
token.json
token_pathNonereturnc                 C  s   |   | _d S N)_load_credentialscredsself r   z/var/www/html/cobodadashboardai.evdpl.com/venv/lib/python3.9/site-packages/langchain_community/document_loaders/youtube.py__post_init__2   s    zGoogleApiClient.__post_init__beforemoder   valuesr   c                 C  s&   |j ds |j ds td|j S )DValidate that either folder_id or document_ids is set, but not both.r   r   -Must specify either channel_name or video_ids)kwargsget
ValueErrorclsr&   r   r   r    #validate_channel_or_videoIds_is_set5   s
    z3GoogleApiClient.validate_channel_or_videoIds_is_setc           	      C  s   z@ddl m} ddlm} ddlm} ddlm} ddlm	} W n t
yZ   t
dY n0 d}| j r||jt| jS | j r|t| jt}|r|js|r|jr|jr||  n|t| jt}|jdd	}t| jd
}||  W d   n1 s0    Y  |S )zLoad credentials.r   )Request)service_account)Credentials)InstalledAppFlowYouTubeTranscriptApiYou must run`pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib youtube-transcript-api` to use the Google Drive loaderN)portw)Zgoogle.auth.transport.requestsr/   Zgoogle.oauth2r0   Zgoogle.oauth2.credentialsr1   Zgoogle_auth_oauthlib.flowr2   youtube_transcript_apir4   ImportErrorr   existsZfrom_service_account_filestrr   Zfrom_authorized_user_fileSCOPESZvalidZexpiredZrefresh_tokenrefreshZfrom_client_secrets_filer   Zrun_local_serveropenwriteto_json)	r   r/   r0   r1   r2   r4   r   Zflowtokenr   r   r    r   @   s6    
	


.z!GoogleApiClient._load_credentialsN)__name__
__module____qualname____doc__r   homer   __annotations__r   r   r!   r   classmethodr.   r   r   r   r   r    r      s   
	r   httphttps>   zyoutu.bezwww.youtube-nocookie.comzyoutube.comzm.youtube.comzvid.pluszwww.youtube.comr;   Optional[str])urlr   c                 C  s   t | }|jtvrdS |jtvr$dS |j}|drp|j}t|}d|v rj|d }t	|t
r`|n|d }qdS n|jd}|dd }t|dkrdS |S )zEParse a YouTube URL and return the video ID if valid, otherwise None.Nz/watchvr   /   )r   schemeALLOWED_SCHEMESnetlocALLOWED_NETLOCSpathendswithqueryr   
isinstancer;   lstripsplitlen)rL   
parsed_urlrU   rW   Zparsed_queryZidsvideo_idr   r   r    _parse_video_idt   s$    


r^   c                   @  s   e Zd ZdZdZdZdZdS )TranscriptFormatz3Output formats of transcripts from `YoutubeLoader`.textlineschunksN)rB   rC   rD   rE   TEXTLINESCHUNKSr   r   r   r    r_      s   r_   c                	   @  s   e Zd ZdZdddejddfdddd	d
dddddZedddddZe	ddd dddZ
ddddddZdddddZdd d!d"Zd#d d$d%ZdS )&YoutubeLoaderz!Load `YouTube` video transcripts.FenNx   r;   boolzUnion[str, Sequence[str]]rK   r_   int)r]   add_video_infolanguagetranslationtranscript_formatcontinue_on_failurechunk_size_secondsc                 C  sR   || _ d|i| _|| _|| _t|tr0|g| _n|| _|| _|| _|| _|| _	dS )z!Initialize with YouTube video ID.sourceN)
r]   	_metadatark   rl   rX   r;   rm   rn   ro   rp   )r   r]   rk   rl   rm   rn   ro   rp   r   r   r    __init__   s    


zYoutubeLoader.__init__)youtube_urlr   c                 C  s    t | }|std|  d|S )z*Extract video ID from common YouTube URLs.z.Could not determine the video ID for the URL "z".)r^   r+   )rt   r]   r   r   r    extract_video_id   s    
zYoutubeLoader.extract_video_idr   )rt   r)   r   c                 K  s   |  |}| |fi |S )z|Given a YouTube URL, construct a loader.
        See `YoutubeLoader()` constructor for a list of keyword arguments.
        )ru   )r-   rt   r)   r]   r   r   r    from_youtube_url   s    
zYoutubeLoader.from_youtube_urlz
List[Dict]r   )chunk_pieceschunk_start_secondsr   c              
   C  sp   t |d\}}t |d\}}tdtdd |i | j||dd|dd|dd| j d| d	d
dS )z0Create Document from chunk of transcript pieces.<    c                 S  s   | d  dS Nr`   rz   strip)Zchunk_piecer   r   r    <lambda>       z4YoutubeLoader._make_chunk_document.<locals>.<lambda>02d: https://www.youtube.com/watch?v=z&t=s)Zstart_secondsZstart_timestamprq   page_contentmetadata)divmodr   joinmaprr   r]   )r   rw   rx   mr   hr   r   r    _make_chunk_document   s     
z"YoutubeLoader._make_chunk_documentzGenerator[Document, None, None])transcript_piecesr   c                 c  s|   g }d}| j }|D ]J}|d |d  }||krR|r@| ||V  g }|}|| j 7 }|| qt|dkrx| ||V  d S )Nr   startduration)rp   r   appendr[   )r   r   rw   rx   Zchunk_time_limittranscript_pieceZ	piece_endr   r   r    _get_transcript_chunks   s    
z$YoutubeLoader._get_transcript_chunksList[Document]r   c                 C  s<  zddl m}m}m} W n ty2   tdY n0 | jrN|  }| j| z|	| j
}W n |yt   g  Y S 0 z|| j}W n |y   |dg}Y n0 | jdur|| j}| }| jtjkrdtdd |}t|| jd	gS | jtjkrttd
d |S | jtjkr0t| |S tddS )z1Load YouTube transcripts into `Document` objects.r   )NoTranscriptFoundTranscriptsDisabledr4   zvCould not import "youtube_transcript_api" Python package. Please install it with `pip install youtube-transcript-api`.rg   Nrz   c                 S  s   | d  dS r{   r|   r   r   r   r    r~     r   z$YoutubeLoader.load.<locals>.<lambda>r   c                 S  s(   t | d dttdd |  dS )Nr`   rz   c                 S  s   | d dkS )Nr   r`   r   )itemr   r   r    r~   "  r   z6YoutubeLoader.load.<locals>.<lambda>.<locals>.<lambda>r   )r   r}   dictfilteritemsr   r   r   r    r~     s   zUnknown transcript format.)r8   r   r   r4   r9   rk   _get_video_inforr   updatelist_transcriptsr]   find_transcriptrl   rm   	translatefetchrn   r_   rc   r   r   r   rd   listre   r   r+   )r   r   r   r4   
video_infotranscript_list
transcriptr   r   r   r    load   sL    


zYoutubeLoader.loadr   c                 C  s   zddl m} W n ty*   tdY n0 |d| j }|jpDd|jpLd|jpTd|jp\d|jrn|j	dnd|j
pxd|jpdd}|S )zGet important video information.

        Components include:
            - title
            - description
            - thumbnail URL,
            - publish_date
            - channel author
            - and more.
        r   )YouTubezVCould not import "pytube" Python package. Please install it with `pip install pytube`.r   Unknownz%Y-%m-%d %H:%M:%S)titledescriptionZ
view_countthumbnail_urlpublish_datelengthauthor)Zpytuber   r9   r]   r   r   Zviewsr   r   strftimer   r   )r   r   Zytr   r   r   r    r   /  s$    
zYoutubeLoader._get_video_info)rB   rC   rD   rE   r_   rc   rs   staticmethodru   rH   rv   r   r   r   r   r   r   r   r    rf      s    	?rf   c                   @  s   e Zd ZU dZded< dZded< dZded< d	Zd
ed< dZded< dZ	d
ed< ddddZ
dddddZeddedddddZdddd d!Zddd"d#d$d%Zddd&d'd(Zddd)d*d+Zddd,d-d.d/Zd,dd0d1ZdS )2GoogleApiYoutubeLoadera  Load all Videos from a `YouTube` Channel.

    To use, you should have the ``googleapiclient,youtube_transcript_api``
    python package installed.
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

    Additionally you have to either provide a channel name or a list of videoids
    "https://developers.google.com/docs/api/quickstart/python"



    Example:
        .. code-block:: python

            from langchain_community.document_loaders import GoogleApiClient
            from langchain_community.document_loaders import GoogleApiYoutubeLoader
            google_api_client = GoogleApiClient(
                service_account_path=Path("path_to_your_sec_file.json")
            )
            loader = GoogleApiYoutubeLoader(
                google_api_client=google_api_client,
                channel_name = "CodeAesthetic"
            )
            load.load()

    r   google_api_clientNrK   channel_namezOptional[List[str]]	video_idsTri   rk   rg   r;   captions_languageFro   r   r   c                 C  s   |  | jj| _d S r   )_build_youtube_clientr   r   youtube_clientr   r   r   r    r!   v  s    z$GoogleApiYoutubeLoader.__post_init__r   )r   r   c                 C  sF   zddl m} ddlm} W n ty6   tdY n0 |dd|dS )Nr   )buildr3   r5   ZyoutubeZv3)credentials)Zgoogleapiclient.discoveryr   r8   r4   r9   )r   r   r   r4   r   r   r    r   y  s    
	z,GoogleApiYoutubeLoader._build_youtube_clientr"   r#   zDict[str, Any]r%   c                 C  s    | ds| dstd|S )r'   r   r   r(   )r*   r+   r,   r   r   r    r.     s    z:GoogleApiYoutubeLoader.validate_channel_or_videoIds_is_set)r]   r   c                 C  st   ddl m}m} ||}z|| jg}W n* |yV   |D ]}|| j}q>q>Y n0 | }ddd |D S )Nr   )r   r4   rz   c                 S  s   g | ]}|d   dqS )r`   rz   r|   ).0tr   r   r    
<listcomp>  r   zGGoogleApiYoutubeLoader._get_transcripe_for_video_id.<locals>.<listcomp>)	r8   r   r4   r   r   r   r   r   r   )r   r]   r   r4   r   r   Zavailable_transcriptr   r   r   r    _get_transcripe_for_video_id  s    

z3GoogleApiYoutubeLoader._get_transcripe_for_video_idr   )r]   r)   r   c                 K  s8   |  |}| j jd|d }t||dd dS )N
id,snippetpartidr   r   r   )r   r   Zvideosr   executer   r*   )r   r]   r)   ZcaptionsZvideo_responser   r   r    _get_document_for_video_id  s    

z1GoogleApiYoutubeLoader._get_document_for_video_id)r   r   c                 C  s8   | j  jd|ddd}| }|d d d d }|S )Nr   channel   )r   qtype
maxResultsr   r   Z	channelId)r   searchr   r   )r   r   requestresponse
channel_idr   r   r    _get_channel_id  s    
z&GoogleApiYoutubeLoader._get_channel_id)r   r   c                 C  s4   | j  jd|d}| }|d d d d d S )NZcontentDetailsr   r   r   ZrelatedPlaylistsZuploads)r   Zchannelsr   r   )r   r   r   r   r   r   r    _get_uploads_playlist_id  s    
z/GoogleApiYoutubeLoader._get_uploads_playlist_idr   )r   r)   r   c                 K  sN  zddl m}m} W n ty.   tdY n0 | |}| |}| j jd|dd}g }|d urJ|	 }	|	d D ]}
|
d d	 d
 }d
|i}| j
r|
d d ||
d  z | |}|t||d W qx ||tfy2 } z<| jrtdd|
d d
  d|   n|W Y d }~qxd }~0 0 qx| j ||	}q^|S )Nr   )r   r   zTYou must run`pip install --upgrade youtube-transcript-api` to use the youtube loaderr   2   )r   Z
playlistIdr   r   ZsnippetZ
resourceIdZvideoIdZ
thumbnailsr   zError fetching transscript rz   r   z, exception: )r8   r   r   r9   r   r   r   ZplaylistItemsr   r   rk   popr   r   r   r   r   ro   loggererrorr   Z	list_next)r   r   r)   r   r   r   Zuploads_playlist_idr   r   r   r   r]   Z	meta_datar   er   r   r    _get_document_for_channel  sR    





z0GoogleApiYoutubeLoader._get_document_for_channelc                   sL   g } j r|  j  n* jr@| fdd jD  ntd|S )zLoad documents.c                   s   g | ]}  |qS r   )r   )r   r]   r   r   r    r     s   z/GoogleApiYoutubeLoader.load.<locals>.<listcomp>r(   )r   extendr   r   r+   )r   Zdocument_listr   r   r    r     s    
zGoogleApiYoutubeLoader.load)rB   rC   rD   rE   rG   r   r   rk   r   ro   r!   r   r   rH   r.   r   r   r   r   r   r   r   r   r   r    r   Q  s$   
5r   )(rE   
__future__r   loggingenumr   pathlibr   typingr   r   r   r   r	   r
   r   urllib.parser   r   Zxml.etree.ElementTreer   Zlangchain_core.documentsr   Zpydanticr   Zpydantic.dataclassesr   Z)langchain_community.document_loaders.baser   	getLoggerrB   r   r<   r   rR   rT   r^   r_   rf   r   r   r   r   r    <module>   s.   $
Q
 8