#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2016 Irshad Ahmad Bhat
from ._decode import DECODERS
from .script_transliterate import (Ind2RU, Rom2Ind,
Urd2Ind, Ind2Ind)
def _get_decoder(decode):
    """Return the entry of ``DECODERS`` registered under ``decode``.

    Parameters
    ----------
    decode : str
        Key identifying the decoding algorithm.

    Raises
    ------
    ValueError
        If ``decode`` is not a key of ``DECODERS``.
    """
    try:
        decoder = DECODERS[decode]
    except KeyError:
        # Report an unknown name as a bad argument value rather than
        # leaking the lookup's KeyError to the caller.
        raise ValueError('Unknown decoder {0!r}'.format(decode))
    return decoder
class Transliterator():
    """Transliterator for Indic scripts including English and Urdu.

    Parameters
    ----------
    source : str, default: hin
        Source Language (3 letter ISO-639 code). Case-insensitive.
    target : str, default: eng
        Target Language (3 letter ISO-639 code). Case-insensitive.
    decode : str, default: viterbi
        Decoding algorithm, either "viterbi" or "beamsearch".
        Not passed on to the Indic-to-Indic converter.
    build_lookup : bool, default: False
        Flag to build lookup-table. Speeds up the transliteration
        process if the input text contains repeating words.
        Not passed on to the Indic-to-Indic converter.

    Attributes
    ----------
    transform : callable
        Bound method of the underlying converter selected for the
        language pair; ``convert`` delegates to it.

    Raises
    ------
    ValueError
        If ``decode`` is not a known decoder.
    NotImplementedError
        If the language pair is not supported.

    Notes
    -----
    The codes ``mar``, ``nep``, ``kok`` and ``bod`` are handled as ``hin``
    and ``asm`` is handled as ``ben``. A pair whose source and target end
    up identical after this mapping is rejected as not implemented.

    Examples
    --------
    >>> from indictrans import Transliterator
    >>> trn = Transliterator(source='hin', target='eng', build_lookup=True)
    >>> hin = '''कांग्रेस पार्टी अध्यक्ष सोनिया गांधी, तमिलनाडु की मुख्यमंत्री
    ... जयललिता और रिज़र्व बैंक के गवर्नर रघुराम राजन के बीच एक
    ... समानता है. ये सभी अलग-अलग कारणों से भारतीय जनता पार्टी के
    ... राज्यसभा सांसद सुब्रमण्यम स्वामी के निशाने पर हैं. उनके
    ... जयललिता और सोनिया गांधी के पीछे पड़ने का कारण कथित
    ... भ्रष्टाचार है.'''
    >>> eng = trn.transform(hin)
    >>> print(eng)
    congress party adhyaksh sonia gandhi, tamilnadu kii mukhyamantri
    jayalalita our reserve baink ke governor raghuram rajan ke beech ek
    samanta hai. ye sabi alag-alag carnon se bharatiya janata party ke
    rajyasabha saansad subramanyam swami ke nishane par hain. unke
    jayalalita our sonia gandhi ke peeche padane ka kaaran kathith
    bhrashtachar hai.
    """

    def __init__(self, source='hin', target='eng',
                 decode='viterbi', build_lookup=False):
        source = source.lower()
        target = target.lower()
        # Remember the codes as requested so error messages show what the
        # caller asked for, not the aliased codes used internally.
        requested = (source, target)

        # Codes that are handled through another language's code.
        aliases = {'mar': 'hin', 'nep': 'hin', 'kok': 'hin',
                   'bod': 'hin', 'asm': 'ben'}
        source = aliases.get(source, source)
        target = aliases.get(target, target)

        indic = ('hin', 'guj', 'pan', 'ben', 'mal',
                 'kan', 'tam', 'tel', 'ori')
        non_indic = ('eng', 'urd')

        # Resolved before the pair check so that an unknown decoder is
        # reported (ValueError) ahead of an unsupported language pair.
        decoder = (decode, _get_decoder(decode))

        if source in non_indic:
            supported = target in indic
        elif target in non_indic:
            supported = source in indic
        else:
            supported = (source in indic and target in indic and
                         source != target)
        if not supported:
            raise NotImplementedError(
                'Language pair `%s-%s` is not implemented.' % requested)

        if source == 'eng':
            engine = Rom2Ind(source, target, decoder, build_lookup)
        elif source == 'urd':
            engine = Urd2Ind(source, target, decoder, build_lookup)
        elif target in non_indic:
            engine = Ind2RU(source, target, decoder, build_lookup)
        else:
            # Indic-to-Indic conversion takes neither a decoder nor a
            # lookup table and exposes a single entry point.
            self.transform = Ind2Ind(source, target).rtrans
            return

        # Any decoder other than viterbi is served by the top-n method.
        if decode == 'viterbi':
            self.transform = engine.transliterate
        else:
            self.transform = engine.top_n_trans

    def convert(self, line):
        """Transliterate ``line`` by delegating to ``self.transform``."""
        return self.transform(line)