Skip to content

Commit 269c5e1

Browse files
committed
first commit
1 parent 421fbb8 commit 269c5e1

File tree

2 files changed

+80
-1
lines changed

2 files changed

+80
-1
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
*.pyc
22
.DS_Store
3-
__init__.py
3+
./__init__.py

text/__init__.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#-*- coding: utf-8 -*-
2+
3+
import re
4+
from Tacotron.text import cleaners
5+
from Tacotron.text.symbols import symbols
6+
7+
8+
9+
10+
# Mappings from symbol to numeric ID (its position in `symbols`) and back:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = dict(enumerate(symbols))

# Regular expression matching text enclosed in curly braces. Groups are
# (text before the braces, brace contents, everything after).
# re.DOTALL lets '.' span newlines: without it, a newline ahead of a brace
# group prevents the match, and any text following a newline after a brace
# group falls outside group 3 and is silently lost by the caller.
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)', re.DOTALL)
16+
17+
18+
def text_to_sequence(text, cleaner_names):
    '''Encodes a string of text as a list of symbol IDs, ending with the EOS ID.

    Segments wrapped in curly braces are treated as whitespace-separated
    ARPAbet symbols, for example "Turn left on {HH AW1 S S T AH0 N} Street."
    All other text is passed through the named cleaners before encoding.

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integers corresponding to the symbols in the text
    '''
    ids = []
    remaining = text

    # Peel off one "{...}" group per iteration until none is left.
    while len(remaining) > 0:
        match = _curly_re.match(remaining)
        if match is None:
            # No brace group left: the rest is ordinary text.
            ids.extend(_symbols_to_sequence(_clean_text(remaining, cleaner_names)))
            break
        before, arpabet, after = match.groups()
        ids.extend(_symbols_to_sequence(_clean_text(before, cleaner_names)))
        ids.extend(_arpabet_to_sequence(arpabet))
        remaining = after

    # Append EOS token
    ids.append(_symbol_to_id['~'])
    return ids
46+
47+
48+
def sequence_to_text(sequence):
    '''Decodes a sequence of symbol IDs back into a string.

    IDs that are not in the symbol table are skipped. Symbols carrying the
    '@' ARPAbet prefix are re-wrapped in curly braces, and adjacent brace
    groups are merged into a single space-separated group.
    '''
    pieces = []
    for symbol_id in sequence:
        if symbol_id not in _id_to_symbol:
            continue
        symbol = _id_to_symbol[symbol_id]
        # Enclose ARPAbet back in curly braces:
        if len(symbol) > 1 and symbol[0] == '@':
            symbol = '{%s}' % symbol[1:]
        pieces.append(symbol)
    return ''.join(pieces).replace('}{', ' ')
59+
60+
61+
def _clean_text(text, cleaner_names):
    '''Runs text through each named cleaner from the cleaners module, in order.

    Args:
      text: string to clean
      cleaner_names: names of functions in the cleaners module

    Returns:
      The text after every cleaner has been applied.

    Raises:
      Exception: if a name does not resolve to a cleaner.
    '''
    for name in cleaner_names:
        # Default to None so an unknown name reaches the explicit check below
        # instead of escaping from getattr as a bare AttributeError.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
68+
69+
70+
def _symbols_to_sequence(symbols):
    '''Maps symbols to their IDs, skipping any that should not be kept.'''
    ids = []
    for symbol in symbols:
        if _should_keep_symbol(symbol):
            ids.append(_symbol_to_id[symbol])
    return ids
72+
73+
74+
def _arpabet_to_sequence(text):
    '''Converts whitespace-separated ARPAbet symbols to IDs via their '@' form.'''
    prefixed = ['@' + token for token in text.split()]
    return _symbols_to_sequence(prefixed)
76+
77+
78+
def _should_keep_symbol(s):
    '''Returns True if s is a known symbol other than '_' or '~'.

    '~' is the EOS marker that text_to_sequence appends itself; '_' is
    presumably the padding symbol (confirm against the symbols module).
    '''
    # Compare by value: `is not` against a str literal tests object identity,
    # which depends on interning and is a SyntaxWarning on Python 3.8+.
    return s in _symbol_to_id and s != '_' and s != '~'

0 commit comments

Comments
 (0)