
    piX                       d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZ  ej        d          Z ej                     Z!e!"                     ej#        d                     	 	 	 	 	 	 	 	 	 d.d/d$Z$	 	 	 	 	 	 	 	 	 d.d0d'Z%	 	 	 	 	 	 	 	 	 d.d1d*Z&	 	 	 	 	 	 	 	 	 d2d3d-Z'dS )4    )annotationsN)PathLike)BinaryIO   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDIANA_SUPPORTED_SIMILARTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encodingcut_sequence_chunks	iana_nameidentify_sig_or_bomis_multi_byte_encodingshould_strip_sig_or_bomcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s      皙?TF皙?	sequencesbytes | bytearraystepsint
chunk_size	thresholdfloatcp_isolationlist[str] | Nonecp_exclusionpreemptive_behaviourboolexplainlanguage_thresholdenable_fallbackreturnr   c
                j   t          | t          t          f          s/t          d                    t          |                               |rJt          j        }
t                              t                     t          
                    t                     t          |           }|dk    rwt                              d           |r9t                              t                     t          
                    |
           t          t!          | dddg d          g          S |At                              t          d	d
                    |                     d |D             }ng }|At                              t          dd
                    |                     d |D             }ng }|||z  k    r't                              t          d|||           d}|}|dk    r||z  |k     rt'          ||z            }t          |           t(          k     }t          |           t*          k    }|r4t                              t          d                    |                     n5|r3t                              t          d                    |                     g }|rt-          |           nd}|6|                    |           t                              t          d|           t1                      }g }g }t1                      }d}d}d}t                      }t                      }t3          |           \  }}|D|                    |           t                              t          dt          |          |           |                    d           d|vr|                    d           |t4          z   D ]}|r||vr
|r||v r||v r|                    |           d}||k    }|ot9          |          }|dv r$|s"t                              t          d|           l|dv r$|s"t                              t          d|           ||v r"t                              t          d|           	 t;          |          }n8# t<          t>          f$ r$ t                              t          d|           Y w xY w	 |rS|du rOtA          |du r| dt'          d                   n#| t          |          t'          d                   |           n,tA          |du r| n| t          |          d         |          }nx# tB          tD          f$ rd} t          | tD                    s/t                              t          d|tA          |                      |                    |           Y d} ~ d} ~ ww xY wtG          |sdnt          |          |t'          ||z                      }!|o|duot          |          |k     }"|"r!t                              t          d|           t'          t          |!          dz            }#tI          |#d           }#d}$d}%g }&g }'	 tK          | ||!||||||	  	        D ]y}(|&                    |(           |'                    tM          |(||d!u odt          |          cxk    od k    nc                      |'d"         |k    r|$dz  }$|$|#k    s|r|du r nznJ# tB          $ r=} t                              t          d#|tA          |                      |#}$d!}%Y d} ~ nd} ~ ww xY w|%s|r|s	 | t'          d$          d         '                    |d%&           n\# tB          $ rO} t                              t          d'|tA          |                      |                    |           Y d} ~ d} ~ ww xY w|'rtQ          |'          t          |'          z  nd})|)|k    s|$|#k    r|                    |           |tR          v r |*                    tR          |                    t                              t          d(||$tW          |)d)z  d*+                     |	r5|dd|d,d-fv r,|%s*t!          | |||g ||.          }*||k    r|*}n|dk    r|*}n|*}t                              t          d/|tW          |)d)z  d*+                     |stY          |          }+nt[          |          }+|+rAt                              t          d0                    |tA          |+                               g },|dk    rB|&D ]?}(t]          |(||+rd1                    |+          nd          }-|,                    |-           @t_          |,          }.|.r4t                              t          d2                    |.|                     t!          | ||)||.|du s||ddfv r|nd|.          }/|                    |/           ||ddfv r|)d3k     r|)dk    rmt                              d4|/j0                   |r9t                              t                     t          
                    |
           t          |/g          c S |                    |/           t          |          r|||v rd|v rd|v r|1                                }0t                              d4|0j0                   |r9t                              t                     t          
                    |
           t          |0g          c S ||k    rnt                              d5|           |r9t                              t                     t          
                    |
           t          ||         g          c S t          |          dk    r|s|s|r t                              t          d6           |r6t                              d7|j0                   |                    |           n{|r||r|r|j2        |j2        k    s|0t                              d8           |                    |           n1|r/t                              d9           |                    |           |rDt                              d:|1                                j0        t          |          dz
             nt                              d;           |r9t                              t                     t          
                    |
           |S )<af  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z3Expected object of type bytes or bytearray, got: {}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                .    g | ]}t          |d           S Fr   .0cps     I/var/www/icac/venv/lib/python3.11/site-packages/charset_normalizer/api.py
<listcomp>zfrom_bytes.<locals>.<listcomp>`   "    DDD	"e,,DDD    zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                .    g | ]}t          |d           S r2   r3   r4   s     r7   r8   zfrom_bytes.<locals>.<listcomp>k   r9   r:   z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z\Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.>   utf_7zREncoding %s won't be tested as-is because detection is unreliable without BOM/SIG.zY%s is deemed too similar to a code page that was already considered unsuited. Continuing!z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %szpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      TzaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sg     j@strict)errorsz^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsr=   r>   )preemptive_declarationz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {},z We detected language {} using {}r   z.Encoding detection: %s is most likely the one.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)3
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerr   r   logjoinr!   r   r   r   appendsetr   r   addr   r   ModuleNotFoundErrorImportErrorstrUnicodeDecodeErrorLookupErrorrangemaxr   r   decodesumr   updateroundr   r	   r   r
   r@   bestfingerprint)1r   r    r"   r#   r%   r'   r(   r*   r+   r,   previous_logger_levellengthis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failuresoft_failure_skipfallback_asciifallback_u8fallback_specifiedresultsearly_stop_resultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderer_multi_byte_bonusmax_chunk_gave_upearly_stop_countlazy_str_hard_failure	md_chunks	md_ratioschunkmean_mess_ratiofallback_entrytarget_languages	cd_ratioschunk_languagescd_ratios_mergedcurrent_matchprobable_results1                                                    r7   
from_bytesr   &   s   < i)U!344 
AHHY 
 
 	
  %+\/***i..F{{STTT 	3  111OO1222|IwUBPRSSTUUU

5IIl##		
 	
 	
 ED|DDD

6IIl##		
 	
 	
 ED|DDD*u$%%

l	
 	
 	
 
qyyVe^j00%((
"%i..3E"E"%i..4D"D 


LSS 	
 	
 	
 	
 
 


W^^ 	
 	
 	
 (* .BKy)))t  %$$%7888

N	
 	
 	
 uuF)+)+"%%%*.N'+K.2,..G)7)9)9 3I > >L+$$\222

W		
 	
 	
   )))+++$$W---.? {< {< 	M== 	M\99F""

=!!!&*%1]%B!5 "
:Q;
 ;
 0009M0JJn  
 I%%.B%JJd  
  ---JJk  
 	*@*O*O!!#[1 	 	 	JJD  
 H		$ )>%)G)G ,u44 "+CII+..&s;'7'7#d))'CD*     #& ,u44 "	&s;'7'7'9'9:*# # # #K0 		 		 		a-- 

O!FF	   $**=999HHHH		 )?AAs;/?/?
 
 " .t+.O$$v- 	  	JJ-	   "%SWWq[!1!1 1155 ! %!		'	),$ %
 
     '''  !4GA\1B1B,G,G,G,Ga,G,G,G,G    R=I--$)$$(999( :-=-F-FE
	) 
	) 
	) JJsA	    1$(!!!!!!
	) &	%	 *	

#d))++&--mH-MMMM%   

t!FF	   (..}=== EN!VY#i..!@!@SVi''+;?P+P+P#**=999 666!(()?)NOOOJJ0 o+Q777    1!W&8(HMN N-N ".!(#+=" " " !$666)7&&"g--%3NN"0K

K/C'333		
 	
 	
 % 	D*<]*K*K4]CC 	JJ8??!3'7#8#8    	 G##" 2 2"1&2BLCHH-...# #   11111)<< 	JJ299$m    %  *U22$);Wg(NNN  
 #5
 
 
" 	}%%% 0'7CCC#%% #%%D!*    ;((999OO$9:::%}o66666%%m444 "##	5#+/AV/K/K6!!6!!,>,C,C,E,EOLL@(    7$$_555 5666!?"344444L((LL1  
  7$$_555 5666!7=#9":;;;;; ) 7||q 	. 	,> 	JJa  
  	+LLI"+   NN-....	++3 4 # 4  +~/III'LLUVVVNN;'''' 	+LLUVVVNN>*** VkLLNN#LL1	
 	
 	
 	
 	TUUU /_----...Ns^   4R2R98R9=BT??V4AV//V4B[//
\693\11\6 ,]--
_7A__fpr   c
                V    t          |                                 |||||||||	
  
        S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r   read)
r   r    r"   r#   r%   r'   r(   r*   r+   r,   s
             r7   from_fpr   #  s<      
		  r:   pathstr | bytes | PathLikec
                    t          | d          5 }
t          |
|||||||||	
  
        cddd           S # 1 swxY w Y   dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr   )r   r    r"   r#   r%   r'   r(   r*   r+   r,   r   s              r7   	from_pathr   A  s      
dD		 
R 
 

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
s   6::fp_or_path_or_payload!PathLike | str | BinaryIO | bytesc
                   t          | t          t          f          rt          | |||||||||	
  
        }
nOt          | t          t
          f          rt          | |||||||||	
  
        }
nt          | |||||||||	
  
        }
|
 S )a)  
    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
    )	r    r"   r#   r%   r'   r(   r*   r+   r,   )rK   r`   r   r   rM   rL   r   r   )r   r    r"   r#   r%   r'   r(   r*   r+   r,   guessess              r7   	is_binaryr   `  s    " '#x99 ,
!!%%!51+
 
 
 
	

 
 
 !!%%!51+
 
 
 !!%%!51+
 
 
 ;r:   )	r   r   r   NNTFr   T)r   r   r    r!   r"   r!   r#   r$   r%   r&   r'   r&   r(   r)   r*   r)   r+   r$   r,   r)   r-   r   )r   r   r    r!   r"   r!   r#   r$   r%   r&   r'   r&   r(   r)   r*   r)   r+   r$   r,   r)   r-   r   )r   r   r    r!   r"   r!   r#   r$   r%   r&   r'   r&   r(   r)   r*   r)   r+   r$   r,   r)   r-   r   )	r   r   r   NNTFr   F)r   r   r    r!   r"   r!   r#   r$   r%   r&   r'   r&   r(   r)   r*   r)   r+   r$   r,   r)   r-   r)   )(
__future__r   loggingosr   typingr   cdr   r   r	   r
   constantr   r   r   r   r   mdr   modelsr   r   utilsr   r   r   r   r   r   	getLoggerrQ   StreamHandlerrT   setFormatter	Formatterr   r   r   r    r:   r7   <module>r      sz   " " " " " "                                            0 0 0 0 0 0 0 0                
	/	0	0'''))   GABB   %)%)!% # z z z z z~ %)%)!% #     @ %)%)!% # 
 
 
 
 
B %)%)!% #!? ? ? ? ? ? ?r:   