mirror of
https://github.com/signalwire/freeswitch.git
synced 2025-04-23 11:36:09 +00:00
speech recognition demo in python
git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@5659 d0543943-73ff-0310-b7d9-9358b9ac24b2
This commit is contained in:
parent
88bbc489d1
commit
b3454792b1
1
scripts/py_modules/__init__.py
Normal file
1
scripts/py_modules/__init__.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
# empty
|
198
scripts/py_modules/speechtools.py
Normal file
198
scripts/py_modules/speechtools.py
Normal file
@ -0,0 +1,198 @@
|
|||||||
|
from freeswitch import *
|
||||||
|
from xml.dom import minidom
|
||||||
|
|
||||||
|
# Name of a TTS engine.  Not referenced by any class visible in this module.
VOICE_ENGINE = "cepstral"
|
||||||
|
# Name of a TTS voice.  Not referenced by any class visible in this module.
VOICE = "William"
|
||||||
|
|
||||||
|
"""
|
||||||
|
A few classes that make it easier to write speech applications
|
||||||
|
using Python. It is roughly modelled after the equivalent that
|
||||||
|
is written in JavaScript.
|
||||||
|
|
||||||
|
Status: should work, but not yet complete. Some pending items
|
||||||
|
are mentioned in comments
|
||||||
|
"""
|
||||||
|
|
||||||
|
class Grammar:
    """Settings describing one speech recognition grammar.

    A plain value holder: every constructor argument is stored unchanged
    on the instance under an attribute of the same name.
    """

    def __init__(self, name, path, obj_path,
                 min_score=1, confirm_score=400, halt=False):
        """
        @param name - name used to refer to this grammar later on
        @param path - path to the xml grammar file
        @param obj_path - xml path to find the interpretation from the
                          root of the result xml, eg, 'interpretation'
        @param min_score - score threshold to accept a result
        @param confirm_score - if the score is below this threshold, ask
                               the user whether the result is correct
        @param halt - kept from the js version, where its purpose was
                      unclear; currently unused
        """
        # Identification and location of the grammar.
        (self.name, self.path, self.obj_path) = (name, path, obj_path)
        # Scoring knobs and the unused js-era flag.
        (self.min_score, self.confirm_score, self.halt) = (
            min_score, confirm_score, halt)
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechDetect:
    """Keeps a set of named grammars and starts detection on a session.

    One grammar at a time is the "current" one; detectSpeech() builds the
    detect_speech argument string from it.
    """

    def __init__(self, session, module_name, ip_addr):
        """
        @param session - session object; only its execute() method is
                         used by this class
        @param module_name - first word of the detect_speech argument
                             string
        @param ip_addr - last word of the detect_speech argument string
        """
        self.session = session
        self.module_name = module_name
        self.ip_addr = ip_addr
        self.grammars = {}
        # Current grammar.  Defined up front so that using the object
        # before setGrammar() gives a clear error instead of a missing
        # attribute.
        self.grammar = None

    def addGrammar(self, grammar):
        """Register a grammar under its own name, replacing any previous
        grammar registered with that name.
        """
        self.grammars[grammar.name] = grammar

    def setGrammar(self, name):
        """Make the grammar registered as `name` the current one.

        Raises KeyError if no grammar with that name was added.
        """
        self.grammar = self.grammars[name]

    def detectSpeech(self):
        """Execute the detect_speech application with the current grammar.

        Raises RuntimeError if no grammar has been selected yet.
        """
        if self.grammar is None:
            raise RuntimeError("no grammar selected; call setGrammar() "
                               "before detectSpeech()")

        # TODO: we might not always want to call detect_speech
        # with this cmd, see js version for other options
        # also see detect_speech_function() in mod_dptools.c
        cmd = "%s %s %s %s" % (self.module_name,
                               self.grammar.name,
                               self.grammar.path,
                               self.ip_addr)
        console_log("debug", "calling detect_speech with: %s\n" % cmd)
        self.session.execute("detect_speech", cmd)
        console_log("debug", "finished calling detect_speech\n")
|
||||||
|
|
||||||
|
class SpeechObtainer:
    """Collects recognised phrases delivered through a session's input
    callback until enough phrases arrived or the tries are used up.
    """

    # Values of the `itype` argument handed to the input callback in
    # run(): 0 is logged as DTMF input, 1 carries an event dictionary for
    # handle_event().
    # NOTE(review): presumably mirrors FreeSWITCH's switch_input_type_t
    # enum - confirm against the C headers.
    INPUT_TYPE_DTMF = 0
    INPUT_TYPE_EVENT = 1

    def __init__(self, speech_detect, required_phrases, wait_time, max_tries):
        """
        @param speech_detect - the speech detect object, which holds a
                               reference to underlying session and can
                               be re-used by many SpeechObtainers
        @param required_phrases - the number of required phrases from the
                                  grammar.  for example if it is prompting
                                  for the toppings on a sandwich and min
                                  toppings is 3, use 3.  normally will be 1.
        @param wait_time - the time, in milliseconds, to wait for
                           input during each loop iteration
        @param max_tries - this number multiplied by wait time gives the
                           'total wait time' before we give up and return
                           partial or no result
        """
        self.speech_detect = speech_detect
        self.required_phrases = required_phrases
        self.wait_time = wait_time
        self.max_tries = max_tries

        # Set by setGrammar(); defined here so every instance has it.
        self.grammar = None
        # Phrases recognised so far.  Not cleared by run(), so phrases
        # accumulate across calls on the same instance.
        self.detected_phrases = []

    def setGrammar(self, grammar):
        """
        @param grammar - instance of grammar class

        Remembers the grammar and also registers and selects it on the
        underlying speech detect object.
        """
        self.grammar = grammar
        self.speech_detect.addGrammar(grammar)
        self.speech_detect.setGrammar(self.grammar.name)

    def detectSpeech(self):
        """Start detection by delegating to the speech detect object."""
        self.speech_detect.detectSpeech()

    def run(self):
        """
        listen for results from the asr engine and return the list of
        detected phrases (possibly empty or partial).

        This method does not start detection itself; call detectSpeech()
        beforehand.  It installs an input callback on the session and then
        waits in slices of wait_time until required_phrases phrases were
        detected, max_tries slices elapsed, or the session is no longer
        ready.
        """

        def dtmf_handler(data, itype, funcargs):
            # Called by the session for every input while we are waiting.
            console_log("INFO","\n\nDTMF itype: %s\n" % itype)
            if itype == self.INPUT_TYPE_EVENT:
                return self.handle_event(data, funcargs)
            elif itype == self.INPUT_TYPE_DTMF:
                console_log("INFO","\n\nDTMF input: %s\n" % data)
            else:
                console_log("INFO","\n\nUnknown input type: %s\n" % itype)
            return None

        num_tries = 0

        session = self.speech_detect.session

        console_log("debug", "setting dtmf callback\n")
        session.setDTMFCallback(dtmf_handler, "")
        # The message below says getDigits, but the wait is actually done
        # with collectDigits() inside the loop.
        console_log("debug", "calling getDigits\n")

        console_log("debug", "starting run() while loop\n")
        while (session.ready() and
               num_tries < self.max_tries and
               len(self.detected_phrases) < self.required_phrases):
            console_log("debug", "top of run() while loop\n")
            session.collectDigits(self.wait_time)
            num_tries += 1

        console_log("debug", "while loop finished\n")
        return self.detected_phrases

    def handle_event(self, event, funcargs):
        """
        when the dtmf handler receives an event, it calls back
        this method.  event is a dictionary with subdictionaries ..

        Example 1
        =========

        {'body': None, 'headers': {'Speech-Type': 'begin-speaking'}}

        Example 2
        =========
        {'body': '<result xmlns='http://www.ietf.org/xml/ns/mrcpv2'
        xmlns:ex='http://www.example.com/example' score='100'
        grammar='session:request1@form-level.store'><interpretation>
        <input mode='speech'>waffles</input></interpretation></result>',
        'headers': {'Speech-Type': 'detected-speech'}}

        This dictionary is constructed in run_dtmf_callback() in
        freeswitch_python.cpp

        Returns "stop" for both known event types.  A detected-speech
        event with an empty body, or whose xml contains no element named
        by the grammar's obj_path, is treated as "nothing recognised".

        Raises Exception for an unknown Speech-Type and RuntimeError if
        speech is detected before a grammar was set.
        """

        # what kind of event?
        speech_type = event['headers']['Speech-Type']

        if speech_type == "begin-speaking":
            # not sure what to do with this, try returning "stop"
            # so that it might stop playing a sound file once
            # speech has been detected
            return "stop"

        if speech_type != "detected-speech":
            raise Exception("Unknown speech event: %s" % speech_type)

        if self.grammar is None:
            raise RuntimeError("no grammar set; call setGrammar() before "
                               "handling speech events")

        # extract the detected phrase from the result
        # BUG: this assumes only ONE interpretation in the xml
        # result.  the rest will get ignored
        # NOTE: have to wrap everything with str() (at least
        # calls to console_log) because otherwise it chokes on
        # unicode strings.
        # TODO: check the score
        body = event['body']
        if body:
            dom = minidom.parseString(body)
            try:
                matches = dom.getElementsByTagName(self.grammar.obj_path)
                if matches:
                    phrase_text = self.getText(matches[0])
                    if phrase_text:
                        self.detected_phrases.append(str(phrase_text))
            finally:
                # Break the DOM's internal reference cycles so the tree
                # can be reclaimed promptly.
                dom.unlink()

        # do we want to return stop? what should we return?
        return "stop"

    def getText(self, elt):
        """ given an element, get its text.  if there is more than
        one text node child, just append all the text together.
        Only direct children are considered.
        """
        return "".join([str(child.nodeValue)
                        for child in elt.childNodes
                        if child.nodeType == child.TEXT_NODE])
|
||||||
|
|
87
scripts/recipewizard.py
Normal file
87
scripts/recipewizard.py
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
from freeswitch import *
|
||||||
|
from py_modules.speechtools import Grammar, SpeechDetect
|
||||||
|
from py_modules.speechtools import SpeechObtainer
|
||||||
|
|
||||||
|
import time, os
|
||||||
|
|
||||||
|
# TTS engine name; first argument to session.set_tts_parms() in RecipeWizard.
VOICE_ENGINE = "cepstral"
|
||||||
|
# TTS voice name; second argument to session.set_tts_parms() in RecipeWizard.
VOICE = "William"
|
||||||
|
# Directory that is joined with "mainmenu.xml" to locate the grammar file.
# Hard-coded absolute path: adjust it to where the scripts live on your system.
GRAMMAR_ROOT = "/usr/src/freeswitch_trunk/scripts"
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example speech recognition application in python.
|
||||||
|
|
||||||
|
How to make this work:
|
||||||
|
|
||||||
|
* Get mod_openmrcp working along with an MRCP asr server
|
||||||
|
* Add /usr/src/freeswitch/scripts or equivalent to your PYTHONPATH
|
||||||
|
* Restart freeswitch
|
||||||
|
* Create $GRAMMAR_ROOT/mainmenu.xml from the XML in the mainmenu() docstring
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
class RecipeWizard:
    """Speech-driven "drinks or food" menu.

    Constructing an instance configures TTS on the session and immediately
    runs the whole dialogue via main().
    """

    def __init__(self, session):
        """
        @param session - the answered session to talk to
        """
        self.session = session
        session.set_tts_parms(VOICE_ENGINE, VOICE)
        self.main()

    def main(self):
        """Prompt the caller, wait for one phrase from the main menu
        grammar and speak back what was recognised (or an apology).
        """
        console_log("debug", "recipe wizard main()\n")
        session = self.session

        # Wire up detector -> obtainer -> grammar.
        detector = SpeechDetect(session, "openmrcp", "127.0.0.1")
        self.speechdetect = detector
        obtainer = SpeechObtainer(speech_detect=detector,
                                  required_phrases=1,
                                  wait_time=5000,
                                  max_tries=3)
        self.speechobtainer = obtainer
        grammar_file = os.path.join(GRAMMAR_ROOT, "mainmenu.xml")
        self.grammar = Grammar("mainmenu", grammar_file, "input", 80, 90)
        obtainer.setGrammar(self.grammar)

        # Detection is started before the prompt is spoken.
        console_log("debug", "calling speechobtainer.run()\n")
        obtainer.detectSpeech()
        session.speak("Hello. Welcome to the recipe wizard. Drinks or food?")
        result = obtainer.run()
        console_log("debug", "speechobtainer.run() result: %s\n" % result)

        if not result:
            session.speak("Sorry, I did not hear you")
        else:
            session.speak("Received result. Result is: %s" % result[0])

        console_log("debug", "speechobtainer.run() finished\n")
|
||||||
|
|
||||||
|
def mainmenu():
    """
    Placeholder that does nothing when called; it only carries the
    contents of the mainmenu.xml grammar file in this docstring.

    NOTE(review): SRGS normally expects <rule> elements directly under
    <grammar> (referencing each other with <ruleref>), so the nested
    <rule> below may not be accepted by every ASR server - confirm.

    <!DOCTYPE grammar PUBLIC "-//W3C//DTD GRAMMAR 1.0//EN"
                      "http://www.w3.org/TR/speech-grammar/grammar.dtd">

    <grammar xmlns="http://www.w3.org/2001/06/grammar" xml:lang="en"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://www.w3.org/2001/06/grammar
                                 http://www.w3.org/TR/speech-grammar/grammar.xsd"
             version="1.0" mode="voice" root="root">


      <rule id="root" scope="public">

        <rule id="main">
          <one-of>
            <item weight="10">drinks</item>
            <item weight="2">food</item>
          </one-of>
        </rule>

      </rule>

    </grammar>

    """
    return None
|
||||||
|
|
||||||
|
def handler(uuid):
    """Entry point for a call: answer it, run the recipe wizard, hang up.

    @param uuid - identifier passed to PySession to obtain the session
    """
    session = PySession(uuid)
    try:
        session.answer()
        # The wizard runs its whole dialogue from its constructor, so the
        # instance itself is not needed afterwards.
        RecipeWizard(session)
    finally:
        # Hang up even when the dialogue raised, so a failing script does
        # not leave the call up.
        session.hangup("1")
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user