o
    i%                     @   s8   d dl Z d dlZG dd dZG dd dZddgZdS )    Nc                   @   s"   e Zd ZdZdd ZdddZdS )Perlunipropsa3  
    This class is used to read lists of characters from the Perl Unicode
    Properties (see http://perldoc.perl.org/perluniprops.html).
    The files in the perluniprop.zip are extracted using the Unicode::Tussle
    module from http://search.cpan.org/~bdfoy/Unicode-Tussle-1.11/lib/Unicode/Tussle.pm
    c                 C   s(   t jt jtd | _g d| _d S )Nz/data/perluniprops/)Close_PunctuationCurrency_SymbolIsAlnumIsAlphaIsLowerIsNIsScIsSoIsUpperLine_SeparatorNumberOpen_PunctuationPunctuation	SeparatorSymbolLowercase_LetterTitlecase_LetterUppercase_LetterIsPfIsPi
CJKSymbolsCJK)ospathdirnameabspath__file__datadiravailable_categoriesself r"   L/var/www/edux/Edux_v2/venv/lib/python3.10/site-packages/sacremoses/corpus.py__init__   s   zPerluniprops.__init__Nc                 c   s<    t jdd|d }td|}|dD ]}|V  qdS )u  
        This module returns a list of characters from  the Perl Unicode Properties.
        They are very useful when porting Perl tokenizers to Python.

            >>> from sacremoses.corpus import Perluniprops
            >>> pup = Perluniprops()
            >>> list(pup.chars('Open_Punctuation'))[:5] == [u'(', u'[', u'{', u'༺', u'༼']
            True
            >>> list(pup.chars('Currency_Symbol'))[:5] == [u'$', u'¢', u'£', u'¤', u'¥']
            True
            >>> pup.available_categories[:5]
            ['Close_Punctuation', 'Currency_Symbol', 'IsAlnum', 'IsAlpha', 'IsLower']

        :return: a generator of characters given the specific unicode character category
        dataperlunipropsz.txt
sacremosesutf-8N)r   r   joinpkgutilget_datadecode)r!   categoryrelative_pathbinary_datachr"   r"   r#   chars.   s   zPerluniprops.chars)N)__name__
__module____qualname____doc__r$   r1   r"   r"   r"   r#   r      s    r   c                   @   s"   e Zd ZdZdd ZdddZdS )	NonbreakingPrefixesz
    This is a class to read the nonbreaking prefixes textfiles from the
    Moses Machine Translation toolkit. These lists are used in the Python port
    of the Moses' word tokenizer.
    c                 C   s  t jt jtd | _i dddddddd	d
dddddddddddddddddddddd d!d"d#i d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdEdFdGdHdIdJ| _| jdKdL | j D  d S )MNz/data/nonbreaking_prefixes/assameseasbengalibncatalancaczechcsgermandegreekelenglishenspanishesestonianetfinnishfifrenchfririshgagujaratiguhindihi	hungarianhu	icelandicisitalianitkannadakn
lithuanianltlatvianlv	malayalammlmanipurimnimarathimrdutchnloriyaorpunjabipapolishpl
portugueseptromanianrorussianruslovaksk	slovenianslswedishsvtamiltatetdtyuezh)telugutetum	cantonesechinesec                 S   s   i | ]}||qS r"   r"   .0vr"   r"   r#   
<dictcomp>x   s    z0NonbreakingPrefixes.__init__.<locals>.<dictcomp>)	r   r   r   r   r   r   available_langsupdatevaluesr    r"   r"   r#   r$   K   s   	
 !"#
 )zNonbreakingPrefixes.__init__N#c                 c   s    || j v rd| j |  g}n|dkr dd t| j  D }ndg}|D ](}tjdd|}td|}|d		 D ]}|
 }|rL||sL|V  q<q%dS )
u  
        This module returns a list of nonbreaking prefixes for the specified
        language(s).

            >>> from sacremoses.corpus import NonbreakingPrefixes
            >>> nbp = NonbreakingPrefixes()
            >>> list(nbp.words('en'))[:10] == [u'A', u'B', u'C', u'D', u'E', u'F', u'G', u'H', u'I', u'J']
            True
            >>> list(nbp.words('ta'))[:5] == ['ர', 'ூ', 'திரு', 'ஏ', 'பீ']
            True

        :return: a generator words for the specified language(s).
        nonbreaking_prefix.Nc                 S   s   g | ]}d | qS )r   r"   r   r"   r"   r#   
<listcomp>   s    z-NonbreakingPrefixes.words.<locals>.<listcomp>znonbreaking_prefix.enr%   nonbreaking_prefixesr'   r(   )r   setr   r   r   r)   r*   r+   r,   
splitlinesstrip
startswith)r!   langignore_lines_startswith	filenamesfilenamer.   r/   liner"   r"   r#   wordsz   s$   
zNonbreakingPrefixes.words)Nr   )r2   r3   r4   r5   r$   r   r"   r"   r"   r#   r6   D   s    /r6   )r   r*   r   r6   __all__r"   r"   r"   r#   <module>   s
   <X