# Source code for indictrans.transliterator

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2016 Irshad Ahmad Bhat

from ._decode import DECODERS
from .script_transliterate import (Ind2RU, Rom2Ind,
                                   Urd2Ind, Ind2Ind)


def _get_decoder(decode):
    """Return the entry registered in ``DECODERS`` under the key ``decode``.

    Raises ``ValueError`` when ``decode`` is not a key of ``DECODERS``.
    """
    try:
        decoder = DECODERS[decode]
    except KeyError:
        message = 'Unknown decoder {0!r}'.format(decode)
        raise ValueError(message)
    else:
        return decoder


class Transliterator():
    """Transliterator for Indic scripts including English and Urdu.

    Parameters
    ----------

    source : str, default: hin
        Source Language (3 letter ISO-639 code)

    target : str, default: eng
        Target Language (3 letter ISO-639 code)

    decode : str, default: viterbi
        Decoding algorithm, either "viterbi" or "beamsearch".

    build_lookup : bool, default: False
        Flag to build lookup-table. Speeds up the transliteration
        process if the input text contains repeating words.

    Attributes
    ----------

    transform : callable
        Bound at construction time. When either side of the pair is
        ``eng`` or ``urd`` it is the backend's ``transliterate`` method
        if ``decode == 'viterbi'`` and its ``top_n_trans`` method
        otherwise; for Indic-to-Indic pairs it is ``Ind2Ind.rtrans``.

    Raises
    ------

    ValueError
        If ``decode`` is not a known decoder (raised by ``_get_decoder``).

    NotImplementedError
        If the language pair is not supported. The message reports the
        lower-cased codes as given by the caller, before alias mapping.

    Notes
    -----

    Language codes are lower-cased before use. ``mar``, ``nep``, ``kok``
    and ``bod`` are handled as ``hin``, and ``asm`` as ``ben``. For
    Indic-to-Indic pairs ``decode`` is still validated, but neither it
    nor ``build_lookup`` is passed on to the backend.

    Examples
    --------

    >>> from indictrans import Transliterator
    >>> trn = Transliterator(source='hin', target='eng', build_lookup=True)
    >>> hin = '''कांग्रेस पार्टी अध्यक्ष सोनिया गांधी, तमिलनाडु की मुख्यमंत्री
    ... जयललिता और रिज़र्व बैंक के गवर्नर रघुराम राजन के बीच एक
    ... समानता है. ये सभी अलग-अलग कारणों से भारतीय जनता पार्टी के
    ... राज्यसभा सांसद सुब्रमण्यम स्वामी के निशाने पर हैं. उनके
    ... जयललिता और सोनिया गांधी के पीछे पड़ने का कारण कथित
    ... भ्रष्टाचार है.'''
    >>> eng = trn.transform(hin)
    >>> print(eng)
    congress party adhyaksh sonia gandhi, tamilnadu kii mukhyamantri
    jayalalita our reserve baink ke governor raghuram rajan ke beech ek
    samanta hai. ye sabi alag-alag carnon se bharatiya janata party ke
    rajyasabha saansad subramanyam swami ke nishane par hain. unke
    jayalalita our sonia gandhi ke peeche padane ka kaaran kathith
    bhrashtachar hai.
    """

    # Codes that are handled through another language's code.
    _ALIASES = {
        'mar': 'hin',
        'nep': 'hin',
        'kok': 'hin',
        'bod': 'hin',
        'asm': 'ben',
    }
    # Codes accepted on the Indic side of a pair (after alias mapping).
    _INDIC = ('hin', 'guj', 'pan', 'ben', 'mal', 'kan', 'tam', 'tel', 'ori')
    # Codes that select the Roman/Urdu backends.
    _NON_INDIC = ('eng', 'urd')

    def __init__(self, source='hin', target='eng',
                 decode='viterbi', build_lookup=False):
        source = source.lower()
        target = target.lower()
        # Remember the codes as requested so error messages do not show
        # the alias they were mapped to.
        o_src, o_trg = source, target
        source = self._ALIASES.get(source, source)
        target = self._ALIASES.get(target, target)

        # Validate the decoder first: an unknown name raises ValueError
        # before the language pair is looked at.
        decoder = (decode, _get_decoder(decode))

        indic = self._INDIC
        non_indic = self._NON_INDIC
        if source in non_indic:
            supported = target in indic
        elif target in non_indic:
            supported = source in indic
        else:
            supported = (source in indic and target in indic and
                         source != target)
        if not supported:
            raise NotImplementedError(
                'Language pair `%s-%s` is not implemented.' %
                (o_src, o_trg))

        if source in non_indic or target in non_indic:
            if source == 'eng':
                engine = Rom2Ind(source, target, decoder, build_lookup)
            elif source == 'urd':
                engine = Urd2Ind(source, target, decoder, build_lookup)
            else:
                engine = Ind2RU(source, target, decoder, build_lookup)
            # Only the method that is actually used gets looked up.
            if decode == 'viterbi':
                self.transform = engine.transliterate
            else:
                self.transform = engine.top_n_trans
        else:
            self.transform = Ind2Ind(source, target).rtrans

    def convert(self, line):
        """Return ``self.transform(line)``; an alias for calling
        ``transform`` directly."""
        return self.transform(line)