#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright (C) 2015 Irshad Ahmad Bhat
"""
Transliteration Tool:
Indic to Roman transliterator
"""
from __future__ import unicode_literals
import io
import re
import json
import os.path
import numpy as np
from scipy.sparse import issparse
from ._utils import WX, OneHotEncoder, UrduNormalizer
class BaseTransliterator(object):
"""Base class for transliterator.
Attributes
----------
vectorizer_ : instance
`OneHotEncoder` instance for converting categorical features to
one-hot features.
classes_ : dict
Dictionary of set of tags with unique ids ({id: tag}).
coef_ : array
HMM coefficient array
intercept_init_ : array
HMM intercept array for first layer of trellis.
intercept_trans_ : array
HMM intercept/transition array for middle layers of trellis.
intercept_final_ : array
HMM intercept array for last layer of trellis.
wx_process : method
`wx2utf`/`utf2wx` method of `WX` instance
nu : instance
`UrduNormalizer` instance for normalizing Urdu scripts.
"""
def __init__(self, source, target, decoder, build_lookup=False):
self.source = source
self.target = target
self.lookup = dict()
self.build_lookup = build_lookup
self.decode, self.decoder = decoder
self.tab = '\x01\x03' # mask tabs
self.space = '\x02\x04' # mask spaces
self.esc_ch = '\x00' # escape-sequence for Roman in WX
self.dist_dir = os.path.dirname(os.path.abspath(__file__))
self.base_fit()
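# Note: `decoder` is expected to be a (name, instance) pair, for example
# a hypothetical ('viterbi', viterbi_decoder), so that `self.decode`
# carries the algorithm name checked in `predict` and `self.decoder` the
# object whose `.decode()` method is called. The control characters
# above are reversible placeholders: tabs and spaces are masked before
# word-level splitting and restored at the end of `transliterate`, e.g.
#
#     >>> masked = 'ka kha'.replace(' ', '\x02\x04')
#     >>> masked.replace('\x02\x04', ' ')
#     'ka kha'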
def load_models(self):
"""Loads transliteration models."""
self.vectorizer_ = OneHotEncoder()
model = '%s-%s' % (self.source, self.target)
with open('%s/models/%s/sparse.vec' % (self.dist_dir, model)) as jfp:
self.vectorizer_.unique_feats = json.load(jfp)
self.classes_ = np.load(
'%s/models/%s/classes.npy' %
(self.dist_dir, model),
encoding='latin1')[0]
self.coef_ = np.load(
'%s/models/%s/coef.npy' % (self.dist_dir, model),
encoding='latin1')[0].astype(np.float64)
self.intercept_init_ = np.load(
'%s/models/%s/intercept_init.npy' %
(self.dist_dir, model),
encoding='latin1').astype(np.float64)
self.intercept_trans_ = np.load(
'%s/models/%s/intercept_trans.npy' %
(self.dist_dir, model),
encoding='latin1').astype(np.float64)
self.intercept_final_ = np.load(
'%s/models/%s/intercept_final.npy' %
(self.dist_dir, model),
encoding='latin1').astype(np.float64)
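# Expected on-disk layout for one model pair, as implied by the paths
# built above (shown here for a hypothetical source='hin', target='eng'):
#
#     <package_dir>/models/hin-eng/sparse.vec          JSON feature vocabulary
#     <package_dir>/models/hin-eng/classes.npy         {id: tag} mapping
#     <package_dir>/models/hin-eng/coef.npy            emission weights
#     <package_dir>/models/hin-eng/intercept_init.npy
#     <package_dir>/models/hin-eng/intercept_trans.npy
#     <package_dir>/models/hin-eng/intercept_final.npy
#
# Note: these arrays are stored as pickled object arrays; newer NumPy
# releases (>= 1.16.3) may additionally require allow_pickle=True in
# np.load() to read them.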
def load_mappings(self):
# initialize punctuation map table
self.punkt_tbl = dict()
with io.open('%s/mappings/punkt.map' % self.dist_dir,
encoding='utf-8') as punkt_fp:
for line in punkt_fp:
s, t = line.split()
if self.target == 'urd':
if s in ["'", '"']:
continue
self.punkt_tbl[ord(s)] = t
else:
self.punkt_tbl[ord(t)] = s
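# `punkt_tbl` maps Unicode code points (integer keys via ord()) to their
# replacements, which is the mapping format str.translate() accepts. A
# minimal illustration with a made-up entry (the real table is read from
# mappings/punkt.map and is presumably applied elsewhere by a subclass):
#
#     >>> tbl = {ord('?'): '\u061f'}     # ASCII '?' to Arabic question mark
#     >>> 'kya?'.translate(tbl)
#     'kya\u061f'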
def base_fit(self):
# load models
self.load_models()
# load mapping tables for Urdu
if 'urd' in [self.source, self.target]:
self.load_mappings()
# initialize Urdu Normalizer
if self.source == 'urd':
self.nu = UrduNormalizer()
# initialize wx-converter and character-maps
if self.source in ['eng', 'urd']:
wxp = WX(order='wx2utf', lang=self.target)
self.wx_process = wxp.wx2utf
else:
wxp = WX(order='utf2wx', lang=self.source)
self.wx_process = wxp.utf2wx
self.mask_roman = re.compile(r'([a-zA-Z]+)')
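# `mask_roman` marks runs of Latin letters so that embedded Roman words
# can be escaped before WX conversion (see `convert_to_wx`). A small
# illustration of the substitution it performs there:
#
#     >>> import re
#     >>> re.compile(r'([a-zA-Z]+)').sub('\x00\\1', 'मेरा email')
#     'मेरा \x00email'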
def predict(self, word, k_best=5):
"""Given an encoded word matrix and the HMM parameters, predicts the
output sequence (target word)."""
X = self.vectorizer_.transform(word)
if issparse(X):
scores = X.dot(self.coef_.T).toarray()
else:
scores = self.coef_.dot(X.T).T
if self.decode == 'viterbi':
y = self.decoder.decode(
scores,
self.intercept_trans_,
self.intercept_init_,
self.intercept_final_)
y = [self.classes_[pid].decode('utf-8') for pid in y]
y = ''.join(y).replace('_', '')
return y
else:
top_seq = list()
y = self.decoder.decode(
scores,
self.intercept_trans_,
self.intercept_init_,
self.intercept_final_,
k_best)
for path in y:
w = [self.classes_[pid].decode('utf-8') for pid in path]
w = ''.join(w).replace('_', '')
top_seq.append(w)
return top_seq
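# Shape sketch with hypothetical sizes: for a 4-character word, a
# 1000-feature vocabulary and 60 output classes, X is (4, 1000) and
# coef_ is (60, 1000), so `scores` is (4, 60), one row of class scores
# per input character:
#
#     >>> import numpy as np
#     >>> np.zeros((4, 1000)).dot(np.zeros((60, 1000)).T).shape
#     (4, 60)
#
# The '_' label removed above appears to act as a null (emit nothing)
# output, since it is stripped from the joined prediction.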
def convert_to_wx(self, text):
"""Converts Indic-script text to WX notation; English text is
lowercased and Urdu text is normalized instead."""
if self.source == 'eng':
return text.lower()
if self.source == 'urd':
return self.nu.normalize(text)
if self.source == 'ben':
# Assamese `ra` to Bengali `ra`
text = text.replace('\u09f0', '\u09b0')
# Assamese `va` to Bengali `va`
text = text.replace('\u09f1', '\u09ac')
text = self.mask_roman.sub(r'%s\1' % (self.esc_ch), text)
text = self.wx_process(text)
return text
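# Example of the Assamese-to-Bengali folding done above (codepoint level
# only; the rest of the pipeline then masks Roman runs and applies the
# WX conversion):
#
#     >>> '\u09f0\u09be\u09ae'.replace('\u09f0', '\u09b0')   # ৰাম -> রাম
#     '\u09b0\u09be\u09ae'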
def transliterate(self, text, k_best=None):
"""Returns the single best transliteration using Viterbi decoding."""
trans_list = []
text = self.convert_to_wx(text)
text = text.replace('\t', self.tab)
text = text.replace(' ', self.space)
lines = text.split("\n")
for line in lines:
if not line.strip():
trans_list.append(line)
continue
trans_line = str()
line = self.non_alpha.split(line)
for word in line:
trans_line += self.case_trans(word)
trans_list.append(trans_line)
trans_line = '\n'.join(trans_list)
trans_line = trans_line.replace(self.space, ' ')
trans_line = trans_line.replace(self.tab, '\t')
return trans_line
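# End-to-end usage sketch (hypothetical: `non_alpha` and `case_trans`
# are supplied by a concrete subclass, not by this base class):
#
#     >>> # trans = SomeSubclass('hin', 'eng', ('viterbi', viterbi_decoder))
#     >>> # trans.transliterate('नमस्ते दुनिया')
#     >>> # for example 'namaste duniya'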
def top_n_trans(self, text, k_best=5):
"""Returns the k-best transliterations using beam-search decoding.
Parameters
----------
text : str
Text to transliterate.
k_best : int, default: 5, optional
Used by the `Beamsearch` decoder to return the k-best transliterations.
"""
if k_best < 2:
raise ValueError('`k_best` value should be >= 2')
trans_word = []
text = self.convert_to_wx(text)
words = self.non_alpha.split(text)
for word in words:
op_word = self.case_trans(word, k_best)
if isinstance(op_word, list):
trans_word.append(op_word)
else:
trans_word.append([word] * k_best)
return [''.join(w) for w in zip(*trans_word)]
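# A small, self-contained illustration (made-up data) of how the final
# line merges per-word k-best lists into k whole-text candidates:
#
#     >>> trans_word = [['namaste', 'namastey'], [' ', ' '], ['duniya', 'dunia']]
#     >>> [''.join(w) for w in zip(*trans_word)]
#     ['namaste duniya', 'namastey dunia']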