o
    it                     @  s  d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
ZddlmZmZmZ ddlmZmZ ddlZdd	lmZmZ dd
lmZ ddlmZ ddlm  m  mZ ddl m!Z! dddZ"dddZ#G dd dZ$G dd dZ%G dd de!ej&Z'dS ) a  
Read SAS7BDAT files

Based on code written by Jared Hobbs:
  https://bitbucket.org/jaredhobbs/sas7bdat

See also:
  https://github.com/BioStatMatt/sas7bdat

Partial documentation of the file format:
  https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf

Reference for binary data compression:
  http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
    )annotations)abc)datetime	timedeltaN)cast)CompressionOptionsFilePath
ReadBuffer)EmptyDataErrorOutOfBoundsDatetime)	DataFrameisna)
get_handle)Parser)
ReaderBasesas_datetimefloatunitstrc                 C  sR   t | rtjS |dkrtdddt| d S |dkr%tdddt| d S td)Ns     )secondsd)dayszunit must be 'd' or 's')r   pdNaTr   r   
ValueError)r   r    r   Q/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/pandas/io/sas/sas7bdat.py_parse_datetime2   s   r    sas_datetimes	pd.Seriesreturnc                 C  sD   z	t j| |ddW S  ty!   | jt|d}tt j|}| Y S w )a  
    Convert to Timestamp if possible, otherwise to datetime.datetime.
    SAS float64 lacks precision for more than ms resolution so the fit
    to datetime.datetime is ok.

    Parameters
    ----------
    sas_datetimes : {Series, Sequence[float]}
       Dates or datetimes in SAS
    unit : {str}
       "d" if the floats represent dates, "s" for datetimes

    Returns
    -------
    Series
       Series of datetime64 dtype or datetime.datetime.
    z
1960-01-01)r   originr   )r   to_datetimer   applyr    r   Series)r!   r   s_seriesr   r   r   _convert_datetimes@   s   r*   c                   @  s8   e Zd ZU ded< ded< ded< ded< ddd	Zd
S )_SubheaderPointerintoffsetlengthcompressionptyper#   Nonec                 C  s   || _ || _|| _|| _d S N)r-   r.   r/   r0   )selfr-   r.   r/   r0   r   r   r   __init__`   s   
z_SubheaderPointer.__init__N)
r-   r,   r.   r,   r/   r,   r0   r,   r#   r1   __name__
__module____qualname____annotations__r4   r   r   r   r   r+   Z   s   
 r+   c                   @  sH   e Zd ZU ded< ded< ded< ded< ded< ded	< dddZdS )_Columnr,   col_idstr | bytesnamelabelformatbytesctyper.   r#   r1   c                 C  s(   || _ || _|| _|| _|| _|| _d S r2   )r;   r=   r>   r?   rA   r.   )r3   r;   r=   r>   r?   rA   r.   r   r   r   r4   o   s   

z_Column.__init__N)r;   r,   r=   r<   r>   r<   r?   r<   rA   r@   r.   r,   r#   r1   r5   r   r   r   r   r:   g   s   
 r:   c                   @  sp  e Zd ZU dZded< ded< 								dfdgddZdhddZdhddZdhdd Zdid!d"Z	did#d$Z
djd&d'Zdkd*d+Zdld,d-Zdmd/d0Zdnd2d3Zdid4d5Zdod6d7Zd8d9 Zdid:d;Zdpd>d?ZdqdBdCZdrdDdEZdsdHdIZdtdJdKZdtdLdMZdtdNdOZdtdPdQZdtdRdSZdtdTdUZdtdVdWZdtdXdYZdudvd[d\Z d]d^ Z!djd_d`Z"dadb Z#dwdddeZ$dS )xSAS7BDATReadera  
    Read SAS files in SAS7BDAT format.

    Parameters
    ----------
    path_or_buf : path name or buffer
        Name of SAS file or file-like object pointing to SAS file
        contents.
    index : column identifier, defaults to None
        Column to use as index.
    convert_dates : bool, defaults to True
        Attempt to convert dates to Pandas datetime values.  Note that
        some rarely used SAS date formats may be unsupported.
    blank_missing : bool, defaults to True
        Convert empty strings to missing values (SAS uses blanks to
        indicate missing character variables).
    chunksize : int, defaults to None
        Return SAS7BDATReader object for iterations, returns chunks
        with given number of lines.
    encoding : string, defaults to None
        String encoding.
    convert_text : bool, defaults to True
        If False, text variables are left as raw bytes.
    convert_header_text : bool, defaults to True
        If False, header text, including column names, are left as raw
        bytes.
    r,   _int_lengthzbytes | None_cached_pageNTinferpath_or_bufFilePath | ReadBuffer[bytes]convert_datesboolblank_missing	chunksize
int | Noneencoding
str | Noneconvert_textconvert_header_textr/   r   r#   r1   c
           
      C  s   || _ || _|| _|| _|| _|| _|| _d| _d| _g | _	g | _
g | _g | _g | _d | _g | _g | _g | _d| _d| _d| _t|dd|	d| _| jj| _z|   |   W d S  tyd   |    w )Nzlatin-1    r   rbF)is_textr/   )indexrH   rJ   rK   rM   rO   rP   default_encodingr/   column_names_rawcolumn_namescolumn_formatscolumns%_current_page_data_subheader_pointersrD   _column_data_lengths_column_data_offsets_column_types_current_row_in_file_index_current_row_on_page_indexr   handleshandle_path_or_buf_get_properties_parse_metadata	Exceptionclose)
r3   rF   rT   rH   rJ   rK   rM   rO   rP   r/   r   r   r   r4      s@   
zSAS7BDATReader.__init__
np.ndarrayc                 C     t j| jt jdS )z5Return a numpy int64 array of the column data lengthsdtype)npasarrayr[   int64r3   r   r   r   column_data_lengths      z"SAS7BDATReader.column_data_lengthsc                 C  rh   )z0Return a numpy int64 array of the column offsetsri   )rk   rl   r\   rm   rn   r   r   r   column_data_offsets   rp   z"SAS7BDATReader.column_data_offsetsc                 C  s   t j| jt ddS )zj
        Returns a numpy character array of the column types:
           s (string) or d (double)
        S1ri   )rk   rl   r]   rj   rn   r   r   r   column_types   s   zSAS7BDATReader.column_typesc                 C  s   | j   d S r2   )r`   rf   rn   r   r   r   rf      s   zSAS7BDATReader.closec                 C  s  | j d | j d| _| jdttj tjkrtdd\}}| tj	tj
}|tjkrAtj}d| _d| _tj| _tj| _nd| _tj| _tj| _d| _| tjtj}|tjkr_tj}|| }| tjtj}|d	krsd
| _nd| _| tjtjd }|tjv rtj| | _nd| d| _| tjtj }|dkrd| _!n|dkrd| _!nd| _!| "tj#tj$| _%| "tj&tj'| _(t)ddd}| *tj+| tj,}|t-j.|dd | _/| *tj0| tj1}|t-j.|dd | _2| 3tj4| tj5| _6| j | j6d }|  j|7  _t| j| j6krtd| 3tj7| tj8| _9| 3tj:| tj;| _<| "tj=| tj>| _=| "tj?| tj@| _A| "tjB| tjC| _D| "tjE| tjF| _G| jGsi| "tjH| tjI| _Gd S d S )Nr   i   z'magic number mismatch (not a SAS file?)r   r   T   F      <>zunknown (code=)   1unix   2windowsunknownr   r   r   r%   z*The SAS7BDAT file appears to be truncated.)Jrb   seekreadrD   lenconstmagicr   _read_bytesalign_1_offsetalign_1_lengthu64_byte_checker_valuealign_2_valueU64rC   page_bit_offset_x64_page_bit_offsetsubheader_pointer_length_x64_subheader_pointer_lengthpage_bit_offset_x86subheader_pointer_length_x86align_2_offsetalign_2_lengthalign_1_checker_valueendianness_offsetendianness_length
byte_orderencoding_offsetencoding_lengthencoding_namesfile_encodingplatform_offsetplatform_lengthplatform_read_and_convert_header_textdataset_offsetdataset_lengthr=   file_type_offsetfile_type_length	file_typer   _read_floatdate_created_offsetdate_created_lengthr   to_timedeltadate_createddate_modified_offsetdate_modified_lengthdate_modified	_read_intheader_size_offsetheader_size_lengthheader_lengthpage_size_offsetpage_size_length_page_lengthpage_count_offsetpage_count_length_page_countsas_release_offsetsas_release_lengthsas_server_type_offsetsas_server_type_lengthserver_typeos_version_number_offsetos_version_number_length
os_versionos_name_offsetos_name_lengthos_nameos_maker_offsetos_maker_length)r3   align1align2buftotal_alignepochxr   r   r   rc      s   




zSAS7BDATReader._get_propertiesr   c                 C  s(   | j | jpdd}|jr|   t|S )Nr   )nrows)r   rK   emptyrf   StopIteration)r3   dar   r   r   __next__S  s
   zSAS7BDATReader.__next__r-   widthc                 C  sJ   |dvr|    td| ||}|dkrdnd}t| j| |d S )N)rv   ru   zinvalid float widthrv   fr   r   rf   r   r   structunpackr   )r3   r-   r   r   fdr   r   r   r   [  s   zSAS7BDATReader._read_floatc                 C  sP   |dvr|    td| ||}ddddd| }t| j| |d }|S )N)r      rv   ru   zinvalid int widthbhlqr   r   )r3   r-   r   r   itivr   r   r   r   d  s   zSAS7BDATReader._read_intr.   c                 C  s   | j d u r,| j| | j|}t||k r*|   d|dd|dd}t||S || t| j kr=|   td| j |||  S )NzUnable to read r   z bytes from file position .zThe cached page is too small.)rD   rb   r   r   r   rf   r   )r3   r-   r.   r   msgr   r   r   r   m  s   
zSAS7BDATReader._read_bytesr<   c                 C  s   |  | ||dS )N     )_convert_header_textr   rstripr3   r-   r.   r   r   r   r   |  s   z,SAS7BDATReader._read_and_convert_header_textc                 C  sV   d}|s)| j | j| _t| jdkrd S t| j| jkr!td|  }|rd S d S )NFr   z2Failed to read a meta data page from the SAS file.)rb   r   r   rD   r   r   _process_page_meta)r3   doner   r   r   rd     s   zSAS7BDATReader._parse_metadatac                 C  sZ   |    tjtjtjg }| j|v r|   | jtjk}| jtjk}t|p+|p+| j	g kS r2   )
_read_page_headerr   page_meta_typespage_amd_typepage_mix_type_current_page_type_process_page_metadatapage_data_typerI   rZ   )r3   ptis_data_pageis_mix_pager   r   r   r     s   
z!SAS7BDATReader._process_page_metac                 C  s^   | j }tj| }| |tjtj@ | _tj| }| |tj| _	tj
| }| |tj| _d S r2   )r   r   page_type_offsetr   page_type_lengthpage_type_mask2r   block_count_offsetblock_count_length_current_page_block_countsubheader_count_offsetsubheader_count_length_current_page_subheaders_count)r3   
bit_offsettxr   r   r   r     s   



z SAS7BDATReader._read_page_headerc                 C  sp   | j }t| jD ]-}| tj| |}|jdkrq|jtjkr q| 	|j
}| ||j|j}| || qd S )Nr   )r   ranger   _process_subheader_pointersr   subheader_pointers_offsetr.   r/   truncated_subheader_id_read_subheader_signaturer-   _get_subheader_indexr0   _process_subheader)r3   r   ipointersubheader_signaturesubheader_indexr   r   r   r     s   


z%SAS7BDATReader._process_page_metadata	signaturer@   c                 C  sb   t j|}|d u r/|t jkp|dk}|t jk}| jdkr'|r'|r't jj}|S |   t	d|S )Nr   rQ   zUnknown subheader signature)
r   subheader_signature_to_indexgetcompressed_subheader_idcompressed_subheader_typer/   SASIndexdata_subheader_indexrf   r   )r3   r  r/   r0   rT   f1f2r   r   r   r     s   
z#SAS7BDATReader._get_subheader_indexsubheader_pointer_indexr+   c           
      C  st   | j }|||  }| || j}|| j7 }| || j}|| j7 }| |d}|d7 }| |d}t||||}	|	S )Nr   )r   r   rC   r+   )
r3   r-   r  subheader_pointer_lengthtotal_offsetsubheader_offsetsubheader_lengthsubheader_compressionsubheader_typer   r   r   r   r     s   

z*SAS7BDATReader._process_subheader_pointersc                 C  s   |  || j}|S r2   )r   rC   )r3   r-   r  r   r   r   r     s   z(SAS7BDATReader._read_subheader_signaturer  r   c                 C  s   |j }|j}|tjjkr| j}nX|tjjkr| j}nN|tjjkr$| j	}nD|tjj
kr.| j}n:|tjjkr8| j}n0|tjjkrB| j}n&|tjjkrL| j}n|tjjkrV| j}n|tjjkrd| j| d S td||| d S )Nzunknown subheader index)r-   r.   r   r  row_size_index_process_rowsize_subheadercolumn_size_index_process_columnsize_subheadercolumn_text_index_process_columntext_subheadercolumn_name_index_process_columnname_subheadercolumn_attributes_index#_process_columnattributes_subheaderformat_and_label_index_process_format_subheadercolumn_list_index_process_columnlist_subheadersubheader_counts_index_process_subheader_countsr	  rZ   appendr   )r3   r  r   r-   r.   	processorr   r   r   r     s.   z!SAS7BDATReader._process_subheaderc                 C  s   | j }|}|}| jr|d7 }|d7 }n|d7 }|d7 }| |tj|  || _| |tj|  || _| |tj|  || _	| |tj
|  || _tj| }| || || _| |d| _| |d| _d S )Ni  i  ib  iz  r   )rC   r   r   r   row_length_offset_multiplier
row_lengthrow_count_offset_multiplier	row_countcol_count_p1_multipliercol_count_p1col_count_p2_multipliercol_count_p2'row_count_on_mix_page_offset_multiplier_mix_page_row_count_lcs_lcp)r3   r-   r.   int_len
lcs_offset
lcp_offsetmxr   r   r   r    s0   

z)SAS7BDATReader._process_rowsize_subheaderc                 C  sX   | j }||7 }| ||| _| j| j | jkr*td| j d| j d| j d d S d S )Nz Warning: column count mismatch (z + z != z)
)rC   r   column_countr*  r,  print)r3   r-   r.   r1  r   r   r   r    s   
z,SAS7BDATReader._process_columnsize_subheaderc                 C     d S r2   r   r   r   r   r   r"  %  s   z(SAS7BDATReader._process_subheader_countsc           	      C  s  || j 7 }| |tj}| ||}|d| d}| j| t| jdkrd}tj	D ]}||v r5|}q-|| _
|| j 8 }|d }| jrI|d7 }| || j}|d}|dkrwd| _|d }| jrg|d7 }| || j}|d| j | _nB|tjkr|d	 }| jr|d7 }| || j}|d| j | _n"| jdkrd| _|d }| jr|d7 }| || j}|d| j | _t| d
r| | j| _d S d S d S )Nr   r   r   rQ      rv           (   creator_proc)rC   r   r   text_block_size_lengthr   r   rV   r#  r   compression_literalsr/   r   r0  r/  r<  rle_compressionhasattrr   )	r3   r-   r.   text_block_sizer   	cname_rawcompression_literalcloffset1r   r   r   r  (  sT   






"z,SAS7BDATReader._process_columntext_subheaderc                 C  s   | j }||7 }|d|  d d }t|D ]Q}|tj|d   tj }|tj|d   tj }|tj|d   tj }| |tj}	| |tj	}
| |tj
}| j|	 }||
|
|  }| j| | qd S )Nr      ru   r   )rC   r   r   column_name_pointer_length!column_name_text_subheader_offsetcolumn_name_offset_offsetcolumn_name_length_offsetr   !column_name_text_subheader_lengthcolumn_name_offset_lengthcolumn_name_length_lengthrV   rW   r#  r   )r3   r-   r.   r1  column_name_pointers_countr   text_subheadercol_name_offsetcol_name_lengthidx
col_offsetcol_lenname_rawcnamer   r   r   r  V  sB   
z,SAS7BDATReader._process_columnname_subheaderc           
      C  s   | j }|d|  d |d  }t|D ]Y}|| tj ||d   }|d|  tj ||d   }|d|  tj ||d   }| ||}	| j|	 | |tj	}	| j
|	 | |tj}	| j|	dkridnd qd S )Nr   rF  ru   r      d   s)rC   r   r   column_data_offset_offsetcolumn_data_length_offsetcolumn_type_offsetr   r\   r#  column_data_length_lengthr[   column_type_lengthr]   )
r3   r-   r.   r1  column_attributes_vectors_countr   col_data_offsetcol_data_len	col_typesr   r   r   r   r  w  s,   
z2SAS7BDATReader._process_columnattributes_subheaderc                 C  r7  r2   r   r   r   r   r   r     s   z,SAS7BDATReader._process_columnlist_subheaderc                 C  sx  | j }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }|tj d|  }	| |tj	}
t
|
t| jd }| |tj}| |tj}| |tj}t
|t| jd }| |tj}| |	tj}| j| }| ||||  }| j| }| ||||  }t| j}t|| j| ||| j| | j| }| j| | j| d S )N   r   )rC   r   )column_format_text_subheader_index_offsetcolumn_format_offset_offsetcolumn_format_length_offset(column_label_text_subheader_index_offsetcolumn_label_offset_offsetcolumn_label_length_offsetr   )column_format_text_subheader_index_lengthminr   rV   column_format_offset_lengthcolumn_format_length_length(column_label_text_subheader_index_lengthcolumn_label_offset_lengthcolumn_label_length_lengthr   rY   r:   rW   r]   r[   rX   r#  )r3   r-   r.   r1  text_subheader_formatcol_format_offsetcol_format_lentext_subheader_labelcol_label_offsetcol_label_lenr   
format_idxformat_start
format_len	label_idxlabel_start	label_lenlabel_namescolumn_labelformat_namescolumn_formatcurrent_column_numbercolr   r   r   r    sT   


	z(SAS7BDATReader._process_format_subheaderr   c                 C  s   |d u r| j d ur| j }n|d u r| j}t| jdkr#|   td|dkr0| j| jkr0t S | j| j }||kr<|}| jd}| jd}t	j
||ftd| _t	j|d| ft	jd| _d| _t| }|| |  }| jd urz|| j}|S )Nr   zNo columns to parse from filerW  rX  ri   ru   )rK   r(  r   r]   rf   r
   r^   r   countrk   r   object_string_chunkzerosuint8_byte_chunk_current_row_in_chunk_indexr   r   _chunk_to_dataframerT   	set_index)r3   r   mndnsprsltr   r   r   r     s.   

zSAS7BDATReader.readc                 C  s   g | _ | j| j| _t| jdkrdS t| j| jkr3|   dt| jdd| jdd}t||   | j	t
jv rA|   | j	t
jt
jt
jg vrQ|  S dS )Nr   Tz-failed to read complete page from file (read r   z of z bytes)F)rZ   rb   r   r   rD   r   rf   r   r   r   r   r   r   r   r   _read_next_page)r3   r   r   r   r   r    s,   
zSAS7BDATReader._read_next_pagec                 C  s  | j }| j}t|| |}i }d\}}t| jD ]}| j| }| j| dkrl| j|d d f j| jd d}	t	j
|	tj|d||< | jrg| j| tjv rVt|| d||< n| j| tjv rgt|| d||< |d7 }q| j| dkrt	j
| j|d d f |d	||< | jr| jd ur| || j||< | jr|| j d
k}
tj|| |
< |d7 }q|   tdt| j|  t|| j|dd}|S )Nrt   rW  r   ri   )rj   rT   r   r   rX  )rT   r   zunknown column type F)rY   rT   copy)r  r^   r   r5  rW   r]   r  viewr   r   r(   rk   float64rH   rX   r   sas_date_formatsr*   sas_datetime_formatsr  rO   rM   _decode_stringr   rJ   r   nanrf   r   reprr   )r3   nr  ixr  jsjbjr=   col_arriidfr   r   r   r    s8   
 
 
z"SAS7BDATReader._chunk_to_dataframec                 C  s   | | jp| jS r2   )decoderM   rU   r3   r   r   r   r   r  )  s   zSAS7BDATReader._decode_stringr   c                 C  s   | j r| |S |S r2   )rP   r  r  r   r   r   r   ,  s   
z#SAS7BDATReader._convert_header_text)NTTNNTTrE   )rF   rG   rH   rI   rJ   rI   rK   rL   rM   rN   rO   rI   rP   rI   r/   r   r#   r1   )r#   rg   )r#   r1   )r#   r   )r-   r,   r   r,   )r-   r,   r   r,   r#   r,   )r-   r,   r.   r,   )r-   r,   r.   r,   r#   r<   )r#   rI   )r  r@   r#   r,   )r-   r,   r  r,   r#   r+   )r-   r,   r#   r@   )r  r,   r   r+   r#   r1   )r-   r,   r.   r,   r#   r1   r2   )r   rL   r#   r   )r   r@   r#   r<   )%r6   r7   r8   __doc__r9   r4   ro   rq   rs   rf   rc   r   r   r   r   r   rd   r   r   r   r   r   r   r   r  r  r"  r  r  r  r   r  r   r  r  r  r   r   r   r   r   rB      sX   
 
3




l

	
	













.
!

5"
$rB   )r   r   r   r   )r!   r"   r   r   r#   r"   )(r  
__future__r   collectionsr   r   r   r   typingr   numpyrk   pandas._typingr   r   r	   pandas.errorsr
   r   pandasr   r   r   pandas.io.commonr   pandas.io.sas._sasr   pandas.io.sas.sas_constantsiosassas_constantsr   pandas.io.sas.sasreaderr   r    r*   r+   r:   IteratorrB   r   r   r   r   <module>   s(    

