# Source code for indictrans.base

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (C) 2015 Irshad Ahmad Bhat

"""
Transliteration Tool:
Indic to Roman transliterator
"""
from __future__ import unicode_literals

import io
import re
import json
import os.path

import numpy as np
from scipy.sparse import issparse

from ._utils import WX, OneHotEncoder, UrduNormalizer


class BaseTransliterator(object):
    """Base class for transliterator.

    Attributes
    ----------
    vectorizer_ : instance
        `OneHotEncoder` instance for converting categorical
        features to one-hot features.
    classes_ : dict
        Dictionary of set of tags with unique ids ({id: tag}).
    coef_ : array
        HMM coefficient array
    intercept_init_ : array
        HMM intercept array for first layer of trellis.
    intercept_trans_ : array
        HMM intercept/transition array for middle layers of trellis.
    intercept_final_ : array
        HMM intercept array for last layer of trellis.
    wx_process : method
        `wx2utf`/`utf2wx` method of `WX` instance
    nu : instance
        `UrduNormalizer` instance for normalizing Urdu scripts.
    """

    def __init__(self, source, target, decoder, build_lookup=False):
        self.source = source
        self.target = target
        self.lookup = dict()
        self.build_lookup = build_lookup
        # `decoder` is a (name, instance) pair, e.g. ('viterbi', <decoder>)
        self.decode, self.decoder = decoder
        self.tab = '\x01\x03'    # mask tabs
        self.space = '\x02\x04'  # mask spaces
        self.esc_ch = '\x00'     # escape-sequence for Roman in WX
        self.dist_dir = os.path.dirname(os.path.abspath(__file__))
        self.base_fit()

    def load_models(self):
        """Loads transliteration models (feature vectorizer + HMM params)."""
        self.vectorizer_ = OneHotEncoder()
        model = '%s-%s' % (self.source, self.target)
        # explicit utf-8, consistent with `load_mappings` below
        with io.open('%s/models/%s/sparse.vec' % (self.dist_dir, model),
                     encoding='utf-8') as jfp:
            self.vectorizer_.unique_feats = json.load(jfp)
        # The .npy files store pickled (object) arrays, hence the `[0]`
        # unwrapping and latin1 encoding for py2-pickled data.
        # `allow_pickle=True` is required since NumPy 1.16.3, where the
        # default flipped to False for security reasons.
        self.classes_ = np.load(
            '%s/models/%s/classes.npy' % (self.dist_dir, model),
            allow_pickle=True, encoding='latin1')[0]
        self.coef_ = np.load(
            '%s/models/%s/coef.npy' % (self.dist_dir, model),
            allow_pickle=True, encoding='latin1')[0].astype(np.float64)
        self.intercept_init_ = np.load(
            '%s/models/%s/intercept_init.npy' % (self.dist_dir, model),
            allow_pickle=True, encoding='latin1').astype(np.float64)
        self.intercept_trans_ = np.load(
            '%s/models/%s/intercept_trans.npy' % (self.dist_dir, model),
            allow_pickle=True, encoding='latin1').astype(np.float64)
        self.intercept_final_ = np.load(
            '%s/models/%s/intercept_final.npy' % (self.dist_dir, model),
            allow_pickle=True, encoding='latin1').astype(np.float64)

    def load_mappings(self):
        """Loads the punctuation translation table (Urdu pairs only)."""
        # initialize punctuation map table
        self.punkt_tbl = dict()
        with io.open('%s/mappings/punkt.map' % self.dist_dir,
                     encoding='utf-8') as punkt_fp:
            for line in punkt_fp:
                s, t = line.split()
                if self.target == 'urd':
                    # apostrophe/double-quote have no Urdu mapping; skip
                    if s in ["'", '"']:
                        continue
                    self.punkt_tbl[ord(s)] = t
                else:
                    self.punkt_tbl[ord(t)] = s

    def base_fit(self):
        """Loads models/mappings and wires up the WX converter for the
        source-target language pair."""
        # load models
        self.load_models()
        # load mapping tables for Urdu
        if 'urd' in [self.source, self.target]:
            self.load_mappings()
        # initialize Urdu Normalizer
        if self.source == 'urd':
            self.nu = UrduNormalizer()
        # initialize wx-converter and character-maps
        if self.source in ['eng', 'urd']:
            wxp = WX(order='wx2utf', lang=self.target)
            self.wx_process = wxp.wx2utf
        else:
            wxp = WX(order='utf2wx', lang=self.source)
            self.wx_process = wxp.utf2wx
        # matches embedded Roman runs so they can be escaped in WX text
        self.mask_roman = re.compile(r'([a-zA-Z]+)')

    def predict(self, word, k_best=5):
        """Given encoded word matrix and HMM parameters, predicts output
        sequence (target word).

        Returns a single string for viterbi decoding, otherwise a list of
        `k_best` candidate strings (beam-search)."""
        X = self.vectorizer_.transform(word)
        if issparse(X):
            scores = X.dot(self.coef_.T).toarray()
        else:
            scores = self.coef_.dot(X.T).T
        if self.decode == 'viterbi':
            y = self.decoder.decode(
                scores,
                self.intercept_trans_,
                self.intercept_init_,
                self.intercept_final_)
            y = [self.classes_[pid].decode('utf-8') for pid in y]
            # '_' is the epsilon (no-output) label
            y = ''.join(y).replace('_', '')
            return y
        else:
            top_seq = list()
            y = self.decoder.decode(
                scores,
                self.intercept_trans_,
                self.intercept_init_,
                self.intercept_final_,
                k_best)
            for path in y:
                w = [self.classes_[pid].decode('utf-8') for pid in path]
                w = ''.join(w).replace('_', '')
                top_seq.append(w)
            return top_seq

    def convert_to_wx(self, text):
        """Converts Indic scripts to WX."""
        if self.source == 'eng':
            return text.lower()
        if self.source == 'urd':
            return self.nu.normalize(text)
        if self.source == 'ben':
            # Assamese `ra` to Bengali `ra`
            text = text.replace('\u09f0', '\u09b0')
            # Assamese `va` to Bengali `va`
            text = text.replace('\u09f1', '\u09ac')
        # escape Roman substrings so the WX converter passes them through
        text = self.mask_roman.sub(r'%s\1' % (self.esc_ch), text)
        text = self.wx_process(text)
        return text

    def transliterate(self, text, k_best=None):
        """Single best transliteration using viterbi decoding."""
        trans_list = []
        text = self.convert_to_wx(text)
        # protect whitespace from tokenization with sentinel characters
        text = text.replace('\t', self.tab)
        text = text.replace(' ', self.space)
        lines = text.split("\n")
        for line in lines:
            if not line.strip():
                trans_list.append(line)
                continue
            trans_line = str()
            # NOTE(review): `self.non_alpha` and `case_trans` are referenced
            # but not defined in this class -- presumably supplied by a
            # subclass; confirm against derived transliterators.
            line = self.non_alpha.split(line)
            for word in line:
                trans_line += self.case_trans(word)
            trans_list.append(trans_line)
        trans_line = '\n'.join(trans_list)
        trans_line = trans_line.replace(self.space, ' ')
        trans_line = trans_line.replace(self.tab, '\t')
        return trans_line

    def top_n_trans(self, text, k_best=5):
        """Returns k-best transliterations using beamsearch decoding.

        Parameters
        ----------
        k_best : int, default: 5, optional
            Used by `Beamsearch` decoder to return k-best transliterations.
        """
        if k_best < 2:
            raise ValueError('`k_best` value should be >= 2')
        trans_word = []
        text = self.convert_to_wx(text)
        words = self.non_alpha.split(text)
        for word in words:
            op_word = self.case_trans(word, k_best)
            if isinstance(op_word, list):
                trans_word.append(op_word)
            else:
                # untransliterable chunk: repeat it in every candidate
                trans_word.append([word] * k_best)
        return [''.join(w) for w in zip(*trans_word)]