"""Grapheme-to-phoneme helpers built on g2p_en, plus an ARPAbet -> IPA mapper."""

import re

import nltk
from g2p_en import G2p

# Download the required POS tagger
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('cmudict')  # also useful for g2p-en

# Initialize g2p
g2p = G2p()

# Patterns are compiled once at import time instead of on every call.
_DIGIT_RUN = re.compile(r"\d+")
_DISALLOWED_TEXT_CHARS = re.compile(r"[^a-zA-Z0-9' ]+")

# Deletes the characters "0", "1", "2" and "-" in a single pass.
_CMU_MARKER_TABLE = str.maketrans("", "", "012-")


def safe_g2p(text: str):
    """Run ``g2p`` on *text*, retrying once with all digits removed on failure.

    Args:
        text: Input text to convert.

    Returns:
        Whatever ``g2p`` returns (presumably a list of ARPAbet phoneme
        tokens -- confirm against the g2p_en documentation).

    Raises:
        Exception: Whatever ``g2p`` raises if the digit-free retry fails too.
    """
    try:
        return g2p(text)
    except Exception:
        # Deliberately broad best-effort fallback: remove digits and retry.
        cleaned = _DIGIT_RUN.sub("", text)
        return g2p(cleaned)


def clean_text(text: str) -> str:
    """Return *text* with everything except ASCII letters, digits, spaces
    and apostrophes removed."""
    return _DISALLOWED_TEXT_CHARS.sub("", text)


def clean_cmu(text: str) -> str:
    """Normalize a CMU phoneme string.

    Removes the characters ``0``, ``1``, ``2`` and ``-``, strips surrounding
    whitespace and lower-cases the result.
    """
    return text.translate(_CMU_MARKER_TABLE).strip().lower()


CMU_TO_IPA = {
    # Vowels
    "AA": "ɑ",    # odd
    "AE": "æ",    # at
    "AH": "ʌ",    # hut
    "AH0": "ə",   # about (unstressed)
    "AO": "ɔ",    # ought, story
    "AW": "aʊ",   # cow
    "AY": "aɪ",   # hide
    "EH": "ɛ",    # Ed
    "ER": "ɝ",    # stressed "ur", hurt
    "ER0": "ɚ",   # unstressed "ər"
    "EY": "eɪ",   # ate
    "IH": "ɪ",    # it
    "IY": "i",    # eat
    "OW": "oʊ",   # oat
    "OY": "ɔɪ",   # toy
    "UH": "ʊ",    # hood
    "UW": "u",    # two
    # Consonants
    "B": "b",
    "CH": "tʃ",
    "D": "d",
    "DH": "ð",
    "F": "f",
    "G": "ɡ",
    "HH": "h",
    "JH": "dʒ",
    "K": "k",
    "L": "l",
    "M": "m",
    "N": "n",
    "NG": "ŋ",
    "P": "p",
    "R": "r",
    "S": "s",
    "SH": "ʃ",
    "T": "t",
    "TH": "θ",
    "V": "v",
    "W": "w",
    "Y": "j",
    "Z": "z",
    "ZH": "ʒ",
}

# Longest key in the table; the matcher tries tokens of this length first so
# that three-character entries such as "AH0" and "ER0" are reachable.
_MAX_CMU_TOKEN_LEN = max(len(token) for token in CMU_TO_IPA)


def cmu_to_ipa(cmu_sentence: str) -> str:
    """Greedily convert a CMUdict/ARPAbet phoneme string into IPA.

    Each whitespace-separated word is scanned left to right. At every
    position the longest ``CMU_TO_IPA`` key that matches (case-insensitive)
    is consumed; a character that starts no key is kept as-is, lower-cased.
    Converted words are joined with single spaces, so the result has no
    leading or trailing whitespace.

    Examples:
        "DAWN T MEYK" -> "daʊn t meɪk"
        "AH0" -> "ə"

    Args:
        cmu_sentence: Phoneme symbols written without separators inside a
            word, with words separated by whitespace.

    Returns:
        The IPA transcription; an empty string if the input has no words.
    """
    ipa_words = []
    for word in cmu_sentence.split():
        pieces = []
        pos = 0
        while pos < len(word):
            longest = min(_MAX_CMU_TOKEN_LEN, len(word) - pos)
            for size in range(longest, 0, -1):
                # Slice the original word, then upper-case the slice, so
                # indices stay valid even if upper() changes string length.
                ipa = CMU_TO_IPA.get(word[pos:pos + size].upper())
                if ipa is not None:
                    pieces.append(ipa)
                    pos += size
                    break
            else:
                # fallback: keep as lowercase character
                pieces.append(word[pos].lower())
                pos += 1
        ipa_words.append("".join(pieces))
    return " ".join(ipa_words)


def text_to_phoneme(text: str) -> str:
    """Convert raw text into a normalized, lower-case CMU phoneme string.

    The text is filtered with ``clean_text``, passed through ``safe_g2p``,
    the resulting tokens are concatenated without a separator, and the
    string is normalized with ``clean_cmu``.
    """
    phonemes = safe_g2p(clean_text(text))
    return clean_cmu("".join(phonemes))