Source code for indictrans.transliterator

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2016 Irshad Ahmad Bhat

from ._decode import DECODERS
from .script_transliterate import (Ind2RU, Rom2Ind,
                                   Urd2Ind, Ind2Ind)


def _get_decoder(decode):
    """Return the entry of ``DECODERS`` registered under ``decode``.

    Parameters
    ----------
    decode : str
        Name of the decoding algorithm; used as a key into ``DECODERS``.

    Returns
    -------
    object
        Whatever ``DECODERS`` stores for that key.

    Raises
    ------
    ValueError
        If ``decode`` is not a key of ``DECODERS``.
    """
    try:
        decoder = DECODERS[decode]
    except KeyError:
        # Report an unknown name as a bad argument rather than a lookup error.
        message = 'Unknown decoder {0!r}'.format(decode)
        raise ValueError(message)
    return decoder


class Transliterator():
    """Transliterator for Indic scripts including English and Urdu.

    Parameters
    ----------
    source : str, default: hin
        Source Language (3 letter ISO-639 code)

    target : str, default: eng
        Target Language (3 letter ISO-639 code)

    decode : str, default: viterbi
        Decoding algorithm, either "viterbi" or "beamsearch".

    build_lookup : bool, default: False
        Flag to build lookup-table. Speeds up the transliteration process
        if the input text contains repeating words.

    Attributes
    ----------
    transform : callable
        Bound in ``__init__`` to the method of the underlying
        transliterator object that was selected for the language pair.

    Raises
    ------
    ValueError
        If ``decode`` is not a known decoder name.
    NotImplementedError
        If the ``source``-``target`` pair is not supported.

    Examples
    --------
    >>> from indictrans import Transliterator
    >>> trn = Transliterator(source='hin', target='eng', build_lookup=True)
    >>> hin = '''कांग्रेस पार्टी अध्यक्ष सोनिया गांधी, तमिलनाडु की मुख्यमंत्री
    ... जयललिता और रिज़र्व बैंक के गवर्नर रघुराम राजन के बीच एक
    ... समानता है. ये सभी अलग-अलग कारणों से भारतीय जनता पार्टी के
    ... राज्यसभा सांसद सुब्रमण्यम स्वामी के निशाने पर हैं. उनके
    ... जयललिता और सोनिया गांधी के पीछे पड़ने का कारण कथित
    ... भ्रष्टाचार है.'''
    >>> eng = trn.transform(hin)
    >>> print(eng)
    congress party adhyaksh sonia gandhi, tamilnadu kii mukhyamantri
    jayalalita our reserve baink ke governor raghuram rajan ke beech ek
    samanta hai. ye sabi alag-alag carnon se bharatiya janata party ke
    rajyasabha saansad subramanyam swami ke nishane par hain. unke
    jayalalita our sonia gandhi ke peeche padane ka kaaran kathith
    bhrashtachar hai.
    """

    # Codes that are processed under another language's code.
    # NOTE(review): presumably because they share that language's script --
    # confirm against the project documentation.
    _ALIASES = {
        'mar': 'hin',
        'nep': 'hin',
        'kok': 'hin',
        'bod': 'hin',
        'asm': 'ben',
    }

    # Codes accepted on the Indic side of a language pair.
    _INDIC = ('hin', 'guj', 'pan', 'ben', 'mal', 'kan', 'tam', 'tel', 'ori')

    # Codes that may only be paired with one of the Indic codes above.
    _NON_INDIC = ('eng', 'urd')

    def __init__(self, source='hin', target='eng', decode='viterbi',
                 build_lookup=False):
        """Select the transliteration routine for ``source`` -> ``target``.

        See the class docstring for the meaning of the parameters.
        """
        source = source.lower()
        target = target.lower()
        # Remember the codes as requested so error messages do not show
        # the aliased code instead.
        o_src, o_trg = source, target
        source = self._ALIASES.get(source, source)
        target = self._ALIASES.get(target, target)

        # The decoder name is validated for every pair, before the pair
        # itself is checked, even though Ind2Ind below does not receive it.
        decoder = (decode, _get_decoder(decode))

        if source in self._NON_INDIC:
            # English/Urdu -> Indic
            if target not in self._INDIC:
                raise self._unsupported_pair(o_src, o_trg)
            engine_cls = Rom2Ind if source == 'eng' else Urd2Ind
            engine = engine_cls(source, target, decoder, build_lookup)
            self.transform = self._select_transform(engine, decode)
        elif target in self._NON_INDIC:
            # Indic -> English/Urdu
            if source not in self._INDIC:
                raise self._unsupported_pair(o_src, o_trg)
            engine = Ind2RU(source, target, decoder, build_lookup)
            self.transform = self._select_transform(engine, decode)
        else:
            # Indic -> Indic; identical codes (after aliasing) are rejected.
            if (source not in self._INDIC or target not in self._INDIC
                    or source == target):
                raise self._unsupported_pair(o_src, o_trg)
            self.transform = Ind2Ind(source, target).rtrans

    @staticmethod
    def _unsupported_pair(source, target):
        """Build the error raised for a language pair that is not handled."""
        return NotImplementedError(
            'Language pair `%s-%s` is not implemented.' % (source, target))

    @staticmethod
    def _select_transform(engine, decode):
        """Return ``engine.transliterate`` for viterbi, else ``top_n_trans``."""
        # Only the chosen attribute is looked up on the engine.
        return engine.transliterate if decode == 'viterbi' \
            else engine.top_n_trans

    def convert(self, line):
        """Transliterate ``line`` with the routine chosen at construction.

        Parameters
        ----------
        line : str
            Text passed unchanged to ``self.transform``.

        Returns
        -------
        object
            Whatever ``self.transform`` returns for ``line``.
        """
        return self.transform(line)