1
+ #-*- coding: utf-8 -*-
2
+
3
+ import re
4
+ from Tacotron .text import cleaners
5
+ from Tacotron .text .symbols import symbols
6
+
7
+
8
+
9
+
10
# Lookup tables between each symbol and its numeric ID (its index in `symbols`):
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {sym: idx for idx, sym in enumerate(symbols)}

# Pattern splitting text into: a lazily-matched prefix, the contents of the
# first pair of curly braces, and whatever follows the closing brace.
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
16
+
17
+
18
def text_to_sequence(text, cleaner_names):
    '''Encodes a text string as a list of symbol IDs, terminated by the EOS ID.

    Plain text is run through the named cleaners and encoded symbol by symbol.
    Text inside curly braces is treated as a whitespace-separated ARPAbet
    sequence and is not cleaned, e.g. "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the plain text through

    Returns:
      List of integer IDs for the symbols in the text, followed by the ID of '~'
    '''
    ids = []
    remaining = text

    # Consume one "{...}" group per iteration until no braces are left.
    while len(remaining) > 0:
        match = _curly_re.match(remaining)
        if match is None:
            # No braces left: everything that remains is plain text.
            ids.extend(_symbols_to_sequence(_clean_text(remaining, cleaner_names)))
            break
        plain, arpabet, remaining = match.group(1), match.group(2), match.group(3)
        ids.extend(_symbols_to_sequence(_clean_text(plain, cleaner_names)))
        ids.extend(_arpabet_to_sequence(arpabet))

    # Terminate the sequence with the EOS token.
    ids.append(_symbol_to_id['~'])
    return ids
46
+
47
+
48
def sequence_to_text(sequence):
    '''Decodes a sequence of symbol IDs into a string.

    IDs that are missing from the ID-to-symbol table are skipped. Symbols of
    more than one character that start with '@' are rendered inside curly
    braces without the '@', and adjacent braced symbols are merged into a
    single space-separated group.

    Args:
      sequence: iterable of symbol IDs

    Returns:
      The decoded string
    '''
    pieces = []
    for symbol_id in sequence:
        if symbol_id not in _id_to_symbol:
            continue
        symbol = _id_to_symbol[symbol_id]
        # '@'-prefixed entries are ARPAbet symbols: put them back in curly braces.
        if len(symbol) > 1 and symbol[0] == '@':
            symbol = '{%s}' % symbol[1:]
        pieces.append(symbol)
    # "{A}{B}" -> "{A B}"
    return ''.join(pieces).replace('}{', ' ')
59
+
60
+
61
def _clean_text(text, cleaner_names):
    '''Runs `text` through each named cleaner, in order, and returns the result.

    Args:
      text: string to clean
      cleaner_names: names of functions in the `cleaners` module, applied in
        the order given

    Returns:
      The output of the last cleaner, or `text` unchanged when `cleaner_names`
      is empty

    Raises:
      Exception: if a name does not resolve to a cleaner in the `cleaners` module
    '''
    for name in cleaner_names:
        # Default to None so an unknown name reaches the explicit error below
        # rather than escaping from getattr as a bare AttributeError.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
68
+
69
+
70
def _symbols_to_sequence(symbols):
    '''Maps each symbol to its ID, dropping symbols rejected by `_should_keep_symbol`.

    Args:
      symbols: iterable of symbols (a string is iterated character by character)

    Returns:
      List of IDs for the symbols that were kept, in input order
    '''
    ids = []
    for sym in symbols:
        if _should_keep_symbol(sym):
            ids.append(_symbol_to_id[sym])
    return ids
72
+
73
+
74
def _arpabet_to_sequence(text):
    '''Encodes a whitespace-separated ARPAbet string as a list of symbol IDs.

    Each token is prefixed with '@' before lookup, so it is matched against
    the '@'-prefixed entries of the symbol table.
    '''
    prefixed = []
    for phoneme in text.split():
        prefixed.append('@' + phoneme)
    return _symbols_to_sequence(prefixed)
76
+
77
+
78
def _should_keep_symbol(s):
    '''Returns True when `s` is a known symbol other than '_' or '~'.

    '~' is excluded because `text_to_sequence` appends it itself as the EOS
    token. '_' is presumably the padding symbol - confirm against the symbols
    module.
    '''
    # Compare by value: `is not` tests object identity, which only matches
    # equal strings when the interpreter happens to intern them.
    return s in _symbol_to_id and s != '_' and s != '~'
0 commit comments