o
    i!                     @   s   zd dl mZ W n ey   d dl mZ Y nw d dl mZ d dlmZmZ d dlm	Z	m
Z
 d dlmZ G dd deZd	d
 Zdd Zdd Zdd ZdddZdddZdS )    )zip_longest)izip_longest)tee)escapeunescape)Paralleldelayed)tqdmc                   @   sZ   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZeeeeee	e
eeeegZdS )CJKCharsz
    An object that enumerates the code points of the CJK characters as listed on
    http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
    i   i  i.  iϤ  i@  i  i   i  i   i  i0  iO  ie  i  )io io i p i i  i/ ip i i   i N)__name__
__module____qualname____doc__Hangul_JamoCJK_RadicalsPhags_PaHangul_SyllablesCJK_Compatibility_IdeographsCJK_Compatibility_FormsKatakana_Hangul_Halfwidth#Ideographic_Symbols_And_PunctuationTangutKana_SupplementNushuSupplementary_Ideographic_Planeranges r'   r'   J/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/sacremoses/util.pyr
      s4    r
   c                    s   t  fdddD S )u  
    This checks for CJK character.

        >>> CJKChars().ranges
        [(4352, 4607), (11904, 42191), (43072, 43135), (44032, 55215), (63744, 64255), (65072, 65103), (65381, 65500), (94208, 101119), (110592, 110895), (110960, 111359), (131072, 196607)]
        >>> is_cjk(u'㏾')
        True
        >>> is_cjk(u'﹟')
        False

    :param character: The character that needs to be checked.
    :type character: char
    :return: bool
    c                    s,   g | ]\}}|t    ko|kn  qS r'   )ord).0startend	characterr'   r(   
<listcomp>s   s    zis_cjk.<locals>.<listcomp>)r   r   r   r   r   r   r   r   r   r   r   )anyr-   r'   r-   r(   is_cjkc   s
   
r1   c                 C      t | dddddddS )a  
    This function transforms the input text into an "escaped" version suitable
    for well-formed XML formatting.
    Note that the default xml.sax.saxutils.escape() function don't escape
    some characters that Moses does so we have to manually add them to the
    entities dictionary.

        >>> input_str = ''')| & < > ' " ] ['''
        >>> expected_output =  ''')| &amp; &lt; &gt; ' " ] ['''
        >>> escape(input_str) == expected_output
        True
        >>> xml_escape(input_str)
        ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'

    :param text: The text that needs to be escaped.
    :type text: str
    :rtype: str
    &apos;&quot;&#124;&#91;&#93;)'"|[]entities)r   textr'   r'   r(   
xml_escape   s   rA   c                 C   r2   )ai  
    This function transforms the "escaped" version suitable
    for well-formed XML formatting into humanly-readable string.
    Note that the default xml.sax.saxutils.unescape() function don't unescape
    some characters that Moses does so we have to manually add them to the
    entities dictionary.

        >>> from xml.sax.saxutils import unescape
        >>> s = ')&#124; &amp; &lt; &gt; &apos; &quot; &#93; &#91;'
        >>> expected = ''')| & < > ' " ] ['''
        >>> xml_unescape(s) == expected
        True

    :param text: The text that needs to be unescaped.
    :type text: str
    :rtype: str
    r8   r9   r:   r;   r<   )r3   r4   r5   r6   r7   r=   )r   r?   r'   r'   r(   xml_unescape   s   rB   c                 C   s    t | \}}t|d t||S )zp
    From https://docs.python.org/3/library/itertools.html#recipes
    s -> (s0,s1), (s1,s2), (s2, s3), ...
    N)r   nextzip)iterableabr'   r'   r(   pairwise   s   

rH   Nc                 C   s   t | g| }t|d|iS )ziCollect data into fixed-length chunks or blocks
    from https://stackoverflow.com/a/16789869/610569
    	fillvalue)iterr   )rE   nrI   argsr'   r'   r(   grouper   s   rM   Fc                    s>   |rt |n|}|dkrt |S t|d fdd|D S )N   )n_jobsc                 3   s    | ]	}t  |V  qd S N)r   )r*   linefuncr'   r(   	<genexpr>   s    z)parallelize_preprocess.<locals>.<genexpr>)r	   mapr   )rS   iterator	processesprogress_barr'   rR   r(   parallelize_preprocess   s   
rY   rP   )F)	itertoolsr   ImportErrorr   r   xml.sax.saxutilsr   r   joblibr   r   r	   objectr
   r1   rA   rB   rH   rM   rY   r'   r'   r'   r(   <module>   s    S#

	