import random
import re

# NLTK is needed only to load the CMU Pronouncing Dictionary.  The import is
# guarded so the model-building and sampling code stays usable with a
# caller-supplied pronunciation dictionary when NLTK is not installed.
# NOTE(review): this imports the nltk.corpus package and reaches the reader as
# the attribute nltk.corpus.cmudict -- confirm against the installed NLTK.
try:
    import nltk.corpus
except ImportError:
    nltk = None

# Token marking both the start and the end of a word in the bigram model.
_WORD_BOUNDARY = '#'

# Punctuation (and digits) replaced by spaces before splitting into words.
# The original class contained the range "+-=", which covers the digits and
# several punctuation marks; that behaviour is kept but spelled out here.
# The backslash is now really part of the class: the old non-raw string
# collapsed it into an escaped colon.
_PUNCTUATION_RE = re.compile(r'[`!@#$%\^&*()_+\-=0-9{}|\[\]\\:";<>?,./]')

# Numerical stress markers attached to Arpabet phones (e.g. "AE1").
_STRESS_RE = re.compile(r'[0-9]')


def _loadPronouncingDictionary():
    """Return the CMU Pronouncing Dictionary as a word -> pronunciations dict."""
    if nltk is None:
        raise ImportError(
            "nltk is required to load the CMU Pronouncing Dictionary; "
            "install nltk or pass a pronunciation dictionary explicitly")
    reader = nltk.corpus.cmudict
    # Prefer dict(); fall back to the dictionary() accessor that this module
    # originally called.
    loader = getattr(reader, 'dict', None) or reader.dictionary
    return loader()


def _lookupPronunciations(prondict, word):
    """Return the pronunciations of word, or None if it is not listed.

    The uppercase key is tried first (what this module originally used),
    then the lowercase key.
    """
    for key in (word.upper(), word.lower()):
        if key in prondict:
            return prondict[key]
    return None


def buildBigramPhoneModel(corpus, prondict=None):
    """Read in a file and build a bigram phone language model out of it.

    The language model is structured as a dictionary.  Its keys are the
    first phones of the bigrams; its entries are in turn dictionaries keyed
    on the second phone, whose entries are the probabilities of the
    respective bigrams given the first phone.  The token '#' stands for a
    word boundary (word start as a first phone, word end as a second phone).

    For example:

        model = buildBigramPhoneModel('ttlg.txt')

        model['NG']
        {'#': 0.80235121234386475, 'G': 0.039676708302718591,
         'K': 0.087435709037472442, 'L': 0.0014695077149155032,
         'IH': 0.0095518001469507719, 'T': 0.0007347538574577516,
         'Z': 0.058780308596620132}

        model['TH']['IH']
        0.41323792486583183

    corpus is the filename of the text to read.  prondict is an optional
    mapping from words to lists of pronunciations (each a list of Arpabet
    phones, primary pronunciation first); when omitted, the CMU Pronouncing
    Dictionary is loaded through NLTK.  Words missing from the dictionary
    are ignored.
    """
    frequencies = buildUnigramWordFrequencies(corpus)
    if prondict is None:
        prondict = _loadPronouncingDictionary()

    bigrams = {}    # first phone -> {second phone -> weighted count}
    phoneFreq = {}  # first phone -> weighted count as first part of a bigram

    for word, count in frequencies.items():
        pronunciations = _lookupPronunciations(prondict, word)
        if not pronunciations:
            continue
        # The primary pronunciation is the first entry; strip the numerical
        # stress markers to be left with a list of bare Arpabet phones.
        pronunciation = [_STRESS_RE.sub('', ph) for ph in pronunciations[0]]
        if not pronunciation:
            continue

        # Bracket the word with boundary tokens so that word-initial phones
        # are counted after '#' and word-final phones are counted before it.
        sequence = [_WORD_BOUNDARY] + pronunciation + [_WORD_BOUNDARY]
        for prevPhone, phone in zip(sequence, sequence[1:]):
            # Each bigram occurrence counts once per occurrence of the word.
            phoneFreq[prevPhone] = phoneFreq.get(prevPhone, 0) + count
            followers = bigrams.setdefault(prevPhone, {})
            followers[phone] = followers.get(phone, 0) + count

    # Transform bigram counts into bigram probabilities.
    for firstPhone, followers in bigrams.items():
        # float() keeps this a true division under Python 2 as well.
        total = float(phoneFreq[firstPhone])
        for secondPhone in followers:
            followers[secondPhone] = followers[secondPhone] / total

    return bigrams


def buildUnigramWordFrequencies(fn):
    """Find word frequencies in the text file named fn.

    Why is this useful?  If given a word and its frequency n, its
    contribution to phone bigrams is the frequency of the bigram in the
    word times the frequency of the word.

    The text is lowercased, punctuation and digits are treated as word
    separators, and words are split on whitespace.  Returns a dictionary
    mapping each word to the number of times it occurs.
    """
    freq = {}
    with open(fn) as fh:
        for text in fh:
            text = _PUNCTUATION_RE.sub(' ', text).lower()
            # split() with no argument already collapses runs of whitespace
            # and ignores leading/trailing whitespace.
            for word in text.split():
                freq[word] = freq.get(word, 0) + 1
    return freq


def generateRandomPhone(probDict):
    """Draw a random phone from a bigram entry dictionary.

    probDict maps candidate phones to their probabilities (one inner
    dictionary of the model described in buildBigramPhoneModel).  Each
    phone is returned with a likelihood proportional to its probability.

    Raises ValueError if probDict is empty or has no positive weight.
    """
    total = sum(probDict.values())
    if not probDict or total <= 0:
        raise ValueError("cannot draw a phone from an empty distribution")

    # Scale by the actual total so rounding error in the stored
    # probabilities cannot leave part of the range unassigned.
    threshold = random.random() * total
    cumulative = 0.0
    phone = None
    for phone, probability in probDict.items():
        cumulative += probability
        if threshold < cumulative:
            return phone
    # Only reachable through floating-point rounding at the very top of the
    # range; fall back to the last phone visited.
    return phone


def generateRandomWordFromModel(model):
    """Generate a random word based on an existing language model.

    Starts from the word-start token and keeps drawing the next phone from
    the entry of the previous one until the word-end token is drawn.
    Returns the word as a string of phones separated by single spaces.
    Raises KeyError if the model has no entry for a phone that is reached
    (including the word-start token '#').
    """
    word = []
    last = _WORD_BOUNDARY
    while True:
        last = generateRandomPhone(model[last])
        # Reaching the word-end token finishes the word.
        if last == _WORD_BOUNDARY:
            break
        word.append(last)
    return ' '.join(word)


def generateRandomWord(corpus, prondict=None):
    """Generate a random word from the phone distribution of a text file.

    Builds a bigram language model from the file named corpus (see
    buildBigramPhoneModel for prondict) and returns one word drawn from it.
    """
    model = buildBigramPhoneModel(corpus, prondict)
    return generateRandomWordFromModel(model)


def generateRandomWords(corpus, n=10, prondict=None):
    """Create n nonce words in sequence and return them as a list.

    Does not use generateRandomWord() so that the language model need only
    be created once.
    """
    model = buildBigramPhoneModel(corpus, prondict)
    return [generateRandomWordFromModel(model) for _ in range(n)]