U
    ~fhi                     @  s(  d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
ZddlmZmZmZ ddlmZmZ ddlZdd	lmZmZ dd
lmZ ddlmZmZmZmZmZ ddlm Z m!Z! ddl"m#  m$  m%Z& ddl'm(Z( dddddZ)ddddddZ*G dd dZ+G dd de(ej,Z-dS )a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )annotations)abc)datetime	timedeltaN)cast)CompressionOptionsFilePath
ReadBuffer)EmptyDataErrorOutOfBoundsDatetime)	DataFrameisna)
get_handle)read_double_with_byteswapread_float_with_byteswapread_uint16_with_byteswapread_uint32_with_byteswapread_uint64_with_byteswap)Parserget_subheader_index)
ReaderBasefloatstrZsas_datetimeunitc                 C  sV   t | rtjS |dkr,tdddt| d S |dkrJtdddt| d S tdd S )Ns     )secondsd)dayszunit must be 'd' or 's')r   pdZNaTr   r   
ValueErrorr    r#   :/tmp/pip-unpacked-wheel-a5acpmi0/pandas/io/sas/sas7bdat.py_parse_datetime<   s    r%   z	pd.Series)sas_datetimesr   returnc                 C  sJ   zt j| |ddW S  tk
rD   | jt|d}tt j|}| Y S X dS )a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originr   N)r!   Zto_datetimer   applyr%   r   Series)r&   r   Zs_seriesr#   r#   r$   _convert_datetimesJ   s    r,   c                   @  sX   e Zd ZU ded< ded< ded< ded< ded< ded	< ddddddd
dddZdS )_Columnintcol_idstr | bytesnamelabelformatbytesctypelengthNone)r/   r1   r2   r3   r5   r6   r'   c                 C  s(   || _ || _|| _|| _|| _|| _d S N)r/   r1   r2   r3   r5   r6   )selfr/   r1   r2   r3   r5   r6   r#   r#   r$   __init__l   s    
z_Column.__init__N)__name__
__module____qualname____annotations__r:   r#   r#   r#   r$   r-   d   s   
r-   c                   @  s  e Zd ZU dZded< ded< dSd	d
d
ddd
d
ddd	ddZddddZddddZddddZddddZ	ddddZ
dddd Zddd!d"d#Zdddd$d%d&Zddd'd(d)Zddd*d+d,d-Zddd.d/Zd
dd0d1Zddd2d3Zddd4d5Zdddd+d6d7Zdddd+d8d9Zdddd+d:d;Zdddd+d<d=Zdddd+d>d?Zdddd+d@dAZdddd+dBdCZdddd+dDdEZdTdddFdGdHZdIdJ ZdddKdLZdMdN ZdOd*dPdQdRZ dS )USAS7BDATReadera  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : str, 'infer', defaults to None
        String encoding acc. to Python standard encodings,
        encoding='infer' tries to detect the encoding from the file header,
        encoding=None will leave the data in binary format.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    r.   _int_lengthzbytes | None_cached_pageNTinferzFilePath | ReadBuffer[bytes]boolz
int | Nonez
str | Noner   r7   )	path_or_bufconvert_datesblank_missing	chunksizeencodingconvert_textconvert_header_textcompressionr'   c
           
   	   C  s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|dd|	d| _| jj| _| j| j| j| j| j| j| j| jd g	| _ z| !  | "  W n t#k
r   | $   Y nX d S )Nzlatin-1    r   rbF)Zis_textrK   )%indexrE   rF   rG   rH   rI   rJ   default_encodingrK   column_names_rawcolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointersrA   _column_data_lengths_column_data_offsets_column_types_current_row_in_file_indexZ_current_row_on_page_indexr   handleshandle_path_or_buf_process_rowsize_subheader_process_columnsize_subheader_process_subheader_counts_process_columntext_subheader_process_columnname_subheader#_process_columnattributes_subheader_process_format_subheader_process_columnlist_subheader_subheader_processors_get_properties_parse_metadata	Exceptionclose)
r9   rD   rN   rE   rF   rG   rH   rI   rJ   rK   r#   r#   r$   r:      sX       
zSAS7BDATReader.__init__z
np.ndarray)r'   c                 C  s   t j| jt jdS )z5Return a numpy int64 array of the column data lengthsdtype)npasarrayrU   int64r9   r#   r#   r$   column_data_lengths   s    z"SAS7BDATReader.column_data_lengthsc                 C  s   t j| jt jdS )z0Return a numpy int64 array of the column offsetsri   )rk   rl   rV   rm   rn   r#   r#   r$   column_data_offsets   s    z"SAS7BDATReader.column_data_offsetsc                 C  s   t j| jt ddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        ZS1ri   )rk   rl   rW   rj   rn   r#   r#   r$   column_types   s    zSAS7BDATReader.column_typesc                 C  s   | j   d S r8   )rY   rh   rn   r#   r#   r$   rh      s    zSAS7BDATReader.closec                 C  s
  | j d | j d| _| jdttj tjkr<td| tj	tj
}|tjkrtd| _d| _tj| _tj| _nd| _tj| _tj| _d| _| tjtj}|tjkrtj}nd}| tjtj}|dkrd	| _tjd
k| _nd| _tjdk| _| tjtjd }|tj kr6tj | | _!| j"dkrD| j!| _"nd| d| _!t#ddd}| $tj%| tj&}|t'j(|dd | _)| $tj*| tj+}|t'j(|dd | _,| -tj.| tj/| _0| j | j0d }|  j|7  _t| j| j0krtd| -tj1| tj2| _3d S )Nr   i   z'magic number mismatch (not a SAS file?)T   F      <big>littlerB   zunknown (code=)r   r   r   r)   z*The SAS7BDAT file appears to be truncated.)4r[   seekreadrA   lenconstmagicr"   _read_bytesZalign_1_offsetZalign_1_lengthZu64_byte_checker_valueU64r@   Zpage_bit_offset_x64_page_bit_offsetZsubheader_pointer_length_x64_subheader_pointer_lengthZpage_bit_offset_x86Zsubheader_pointer_length_x86Zalign_2_offsetZalign_2_lengthZalign_1_checker_valueZalign_2_valueZendianness_offsetZendianness_length
byte_ordersys	byteorderneed_byteswapZencoding_offsetZencoding_lengthZencoding_namesZinferred_encodingrH   r   _read_floatZdate_created_offsetZdate_created_lengthr!   Zto_timedeltaZdate_createdZdate_modified_offsetZdate_modified_lengthZdate_modified
_read_uintZheader_size_offsetZheader_size_lengthheader_lengthZpage_size_offsetZpage_size_length_page_length)r9   bufZalign1epochxr#   r#   r$   re      sj    



    zSAS7BDATReader._get_propertiesr   c                 C  s(   | j | jpdd}|jr$|   t|S )Nr   )nrows)r{   rG   emptyrh   StopIteration)r9   dar#   r#   r$   __next__8  s
    zSAS7BDATReader.__next__)offsetwidthc                 C  sR   | j d k	st|dkr&t| j || jS |dkr>t| j || jS |   tdd S )Nrs   rr   zinvalid float width)rA   AssertionErrorr   r   r   rh   r"   r9   r   r   r#   r#   r$   r   @  s        zSAS7BDATReader._read_float)r   r   r'   c                 C  s   | j d k	st|dkr&| |dd S |dkr>t| j || jS |dkrVt| j || jS |dkrnt| j || jS |   tdd S )Nr   r      rs   rr   zinvalid int width)	rA   r   r   r   r   r   r   rh   r"   r   r#   r#   r$   r   O  s.          zSAS7BDATReader._read_uint)r   r6   c                 C  sB   | j d k	st|| t| j kr0|   td| j |||  S )NzThe cached page is too small.)rA   r   r|   rh   r"   r9   r   r6   r#   r#   r$   r   c  s
    zSAS7BDATReader._read_bytesr0   )r   r6   r'   c                 C  s   |  | ||dS )N     )_convert_header_textr   rstripr   r#   r#   r$   _read_and_convert_header_textj  s    z,SAS7BDATReader._read_and_convert_header_textc                 C  sN   d}|sJ| j | j| _t| jdkr(qJt| j| jkr@td|  }qd S )NFr   z2Failed to read a meta data page from the SAS file.)r[   r{   r   rA   r|   r"   _process_page_meta)r9   doner#   r#   r$   rf   o  s    zSAS7BDATReader._parse_metadatac                 C  sZ   |    tjtjtjg }| j|kr,|   | jtjk}| jtjk}t|pV|pV| j	g kS r8   )
_read_page_headerr}   page_meta_typesZpage_amd_typepage_mix_type_current_page_type_process_page_metadatapage_data_typerC   rT   )r9   ptZis_data_pageZis_mix_pager#   r#   r$   r   y  s    
z!SAS7BDATReader._process_page_metac                 C  s^   | j }tj| }| |tjtj@ | _tj| }| |tj| _	tj
| }| |tj| _d S r8   )r   r}   Zpage_type_offsetr   Zpage_type_lengthZpage_type_mask2r   Zblock_count_offsetZblock_count_lengthZ_current_page_block_countZsubheader_count_offsetZsubheader_count_length_current_page_subheaders_count)r9   
bit_offsetZtxr#   r#   r$   r     s    


 z SAS7BDATReader._read_page_headerc                 C  s  | j }t| jD ]}tj| }|| j|  }| || j}|| j7 }| || j}|| j7 }| |d}|d7 }| |d}|dks|tjkrq| 	|| j}	t
|	}
| j|
 }|d kr|tjdfk}|tjk}| jr|r|r| j||f n|   td|	 q||| qd S )Nr   r   zUnknown subheader signature )r   ranger   r}   Zsubheader_pointers_offsetr   r   r@   Ztruncated_subheader_idr   r   rd   Zcompressed_subheader_idZcompressed_subheader_typerK   rT   appendrh   r"   )r9   r   ir   Ztotal_offsetZsubheader_offsetZsubheader_lengthZsubheader_compressionZsubheader_typeZsubheader_signatureZsubheader_indexZsubheader_processorf1f2r#   r#   r$   r     s>    





z%SAS7BDATReader._process_page_metadatac                 C  s   | j }|}|}| jr&|d7 }|d7 }n|d7 }|d7 }| |tj|  || _| |tj|  || _| |tj|  || _	| |tj
|  || _tj| }| || || _| |d| _| |d| _d S )Ni  i  ib  iz  r   )r@   r   r   r}   Zrow_length_offset_multiplierZ
row_lengthZrow_count_offset_multiplier	row_countZcol_count_p1_multipliercol_count_p1Zcol_count_p2_multipliercol_count_p2Z'row_count_on_mix_page_offset_multiplierZ_mix_page_row_count_lcs_lcp)r9   r   r6   int_lenZ
lcs_offsetZ
lcp_offsetZmxr#   r#   r$   r\     s8    
  
z)SAS7BDATReader._process_rowsize_subheaderc                 C  sT   | j }||7 }| ||| _| j| j | jkrPtd| j d| j d| j d d S )Nz Warning: column count mismatch (z + z != z)
)r@   r   column_countr   r   print)r9   r   r6   r   r#   r#   r$   r]     s    z,SAS7BDATReader._process_columnsize_subheaderc                 C  s   d S r8   r#   r   r#   r#   r$   r^     s    z(SAS7BDATReader._process_subheader_countsc           	      C  s  || j 7 }| |tj}| ||}|d| d}| j| t| jdkrd}tj	D ]}||kr\|}q\|| _
|| j 8 }|d }| jr|d7 }| || j}|d}|dkrd| _|d }| jr|d7 }| || j}|d| j | _n|tjkr4|d	 }| jr|d7 }| || j}|d| j | _nH| jdkr|d| _|d }| jr^|d7 }| || j}|d| j | _t| d
r| | j| _d S )Nr   r   r   rL      rs           (   creator_proc)r@   r   r}   Ztext_block_size_lengthr   r   rP   r   r|   Zcompression_literalsrK   r   r   r   r   Zrle_compressionhasattrr   )	r9   r   r6   Ztext_block_sizer   Z	cname_rawZcompression_literalZclZoffset1r#   r#   r$   r_     sN    



z,SAS7BDATReader._process_columntext_subheaderc                 C  s   | j }||7 }|d|  d d }t|D ]}|tj|d   tj }|tj|d   tj }|tj|d   tj }| |tj}	| |tj	}
| |tj
}| j|	 }||
|
|  }| j| | q*d S )Nr      rr   r   )r@   r   r}   Zcolumn_name_pointer_lengthZ!column_name_text_subheader_offsetZcolumn_name_offset_offsetZcolumn_name_length_offsetr   Z!column_name_text_subheader_lengthZcolumn_name_offset_lengthZcolumn_name_length_lengthrP   rQ   r   r   )r9   r   r6   r   Zcolumn_name_pointers_countr   Ztext_subheaderZcol_name_offsetZcol_name_lengthidx
col_offsetZcol_lenZname_rawcnamer#   r#   r$   r`     sD      
z,SAS7BDATReader._process_columnname_subheaderc           
      C  s   | j }|d|  d |d  }t|D ]}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| ||}	| j|	 | |tj	}	| j
|	 | |tj}	| j|	dkrdnd q&d S )Nr   r   rr   r      d   s)r@   r   r}   Zcolumn_data_offset_offsetZcolumn_data_length_offsetZcolumn_type_offsetr   rV   r   Zcolumn_data_length_lengthrU   Zcolumn_type_lengthrW   )
r9   r   r6   r   Zcolumn_attributes_vectors_countr   Zcol_data_offsetZcol_data_lenZ	col_typesr   r#   r#   r$   ra   8  s*    
z2SAS7BDATReader._process_columnattributes_subheaderc                 C  s   d S r8   r#   r   r#   r#   r$   rc   R  s    z,SAS7BDATReader._process_columnlist_subheaderc                 C  sx  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| |tj	}
t
|
t| jd }| |tj}| |tj}| |tj}t
|t| jd }| |tj}| |	tj}| j| }| ||||  }| j| }| ||||  }t| j}t|| j| ||| j| | j| }| j| | j| d S )N   r   )r@   r}   Z)column_format_text_subheader_index_offsetZcolumn_format_offset_offsetZcolumn_format_length_offsetZ(column_label_text_subheader_index_offsetZcolumn_label_offset_offsetZcolumn_label_length_offsetr   Z)column_format_text_subheader_index_lengthminr|   rP   Zcolumn_format_offset_lengthZcolumn_format_length_lengthZ(column_label_text_subheader_index_lengthZcolumn_label_offset_lengthZcolumn_label_length_lengthr   rS   r-   rQ   rW   rU   rR   r   )r9   r   r6   r   Ztext_subheader_formatZcol_format_offsetZcol_format_lenZtext_subheader_labelZcol_label_offsetZcol_label_lenr   Z
format_idxZformat_startZ
format_lenZ	label_idxZlabel_startZ	label_lenZlabel_namesZcolumn_labelZformat_namesZcolumn_formatZcurrent_column_numbercolr#   r#   r$   rb   V  s`        


	z(SAS7BDATReader._process_format_subheader)r   r'   c                 C  s   |d kr| j d k	r| j }n|d kr(| j}t| jdkrF|   td|dkr`| j| jkr`t S t|| j| j }| j	d}| j	d}t
j||ftd| _t
j|d| ft
jd| _d| _t| }|| |  }| jd k	r|| j}|S )Nr   zNo columns to parse from filer   r   ri   rr   )rG   r   r|   rW   rh   r
   rX   r   r   countrk   r   object_string_chunkzerosZuint8_byte_chunk_current_row_in_chunk_indexr   r{   _chunk_to_dataframerN   Z	set_index)r9   r   Zndnsprsltr#   r#   r$   r{     s*    

zSAS7BDATReader.readc                 C  s   g | _ | j| j| _t| jdkr(dS t| j| jkrf|   dt| jdd| jdd}t||   | j	t
jkr|   | j	t
jt
jt
jg kr|  S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)rT   r[   r{   r   rA   r|   rh   r"   r   r   r}   r   r   r   r   _read_next_page)r9   msgr#   r#   r$   r     s$    zSAS7BDATReader._read_next_pagec                 C  st  | j }| j}t|| |}i }d\}}t| jD ]*}| j| }| j| dkr| j|d d f j| jd d}	t	j
|	tj|d||< | jr| j| tjkrt|| d||< n"| j| tjkrt|| d||< |d7 }q0| j| dkr<t	j
| j|d d f |d	||< | jr2| jd k	r2| || j||< |d7 }q0|   td
t| j|  q0t|| j|dd}
|
S )N)r   r   r   r   ri   )rj   rN   r   r   r   )rN   zunknown column type F)rS   rN   copy)r   rX   r   r   rQ   rW   r   viewr   r!   r+   rk   Zfloat64rE   rR   r}   Zsas_date_formatsr,   Zsas_datetime_formatsr   rI   rH   _decode_stringr   rh   r"   reprr   )r9   nmixr   ZjsZjbjr1   Zcol_arrZdfr#   r#   r$   r     s2    
 
 
z"SAS7BDATReader._chunk_to_dataframec                 C  s   | | jp| jS r8   )decoderH   rO   r9   br#   r#   r$   r     s    zSAS7BDATReader._decode_stringr4   )r   r'   c                 C  s   | j r| |S |S d S r8   )rJ   r   r   r#   r#   r$   r     s    
z#SAS7BDATReader._convert_header_text)NTTNNTTrB   )N)!r;   r<   r=   __doc__r>   r:   ro   rp   rq   rh   re   r   r   r   r   r   rf   r   r   r   r\   r]   r^   r_   r`   ra   rc   rb   r{   r   r   r   r   r#   r#   r#   r$   r?      sN   
         ?F
+-!7r?   ).r   
__future__r   collectionsr   r   r   r   typingr   Znumpyrk   Zpandas._typingr   r   r	   Zpandas.errorsr
   r   Zpandasr!   r   r   Zpandas.io.commonr   Zpandas.io.sas._byteswapr   r   r   r   r   Zpandas.io.sas._sasr   r   Zpandas.io.sas.sas_constantsioZsasZsas_constantsr}   Zpandas.io.sas.sasreaderr   r%   r,   r-   Iteratorr?   r#   r#   r#   r$   <module>   s&   