speech recognition demo in python

git-svn-id: http://svn.freeswitch.org/svn/freeswitch/trunk@5659 d0543943-73ff-0310-b7d9-9358b9ac24b2
2025-07-18 20:50:17 +00:00 · 2007-08-22 14:23:28 +00:00 · 2007-08-22 14:23:28 +00:00 · b3454792b1
commit b3454792b1
parent 88bbc489d1
3 changed files with 286 additions and 0 deletions
--- a/scripts/py_modules/init.py
+++ b/scripts/py_modules/init.py
@ -0,0 +1 @@
 # empty
--- a/scripts/py_modules/speechtools.py
+++ b/scripts/py_modules/speechtools.py
@ -0,0 +1,198 @@
 from freeswitch import *
 from xml.dom import minidom
 # Presumably the text-to-speech engine and voice names (recipewizard.py
 # passes its own, identically named constants to session.set_tts_parms).
 # None of the classes defined below reads these two values.
 VOICE_ENGINE = "cepstral"
 VOICE = "William"
 """
 A few classes that make it easier to write speech applications
 using Python.  It is roughly modelled after the equivalent that
 is written in JavaScript.
 Status: should work, but not yet complete.  Some pending items
 are mentioned in comments.
 """
 class Grammar:
    """
    Plain value holder describing one speech grammar: where its xml
    file lives, where to look in a result, and the score thresholds.
    """
    def __init__(self, name, path, obj_path,
                 min_score=1, confirm_score=400, halt=False):
        """
        Store every argument unchanged on the instance.
        @param name - key under which the grammar is referenced later
        @param path - location of the xml grammar file
        @param obj_path - xml path, from the root of the result xml, at
                          which the interpretation is found,
                          eg, 'interpretation'
        @param min_score - lowest score at which a result is accepted
        @param confirm_score - results scoring below this should be
                               confirmed with the user
        @param halt - carried over from the js version, currently unused
        """
        # identification and lookup information
        self.name, self.path, self.obj_path = name, path, obj_path
        # score thresholds
        self.min_score, self.confirm_score = min_score, confirm_score
        self.halt = halt
 class SpeechDetect:
    """
    Keeps a session together with the settings needed to start speech
    detection, plus a registry of grammars of which one is selected.
    """
    def __init__(self, session, module_name, ip_addr):
        """
        @param session - session object on which detect_speech is executed
        @param module_name - first field of the detect_speech argument string
        @param ip_addr - last field of the detect_speech argument string
        """
        self.session=session
        self.module_name=module_name
        self.ip_addr=ip_addr
        self.grammars = {}
        # no grammar is selected until setGrammar() is called; defining
        # the attribute here lets detectSpeech() report that clearly
        self.grammar = None
    def addGrammar(self, grammar):
        """
        Register a grammar under its own name, replacing any grammar
        previously registered under that name.
        @param grammar - object with at least name and path attributes
        """
        self.grammars[grammar.name]=grammar
    def setGrammar(self, name):
        """
        Select the grammar used by the next detectSpeech() call.
        Raises KeyError if no grammar was added under that name.
        @param name - name of a grammar passed to addGrammar() earlier
        """
        self.grammar = self.grammars[name]
    def detectSpeech(self):
        """
        Execute detect_speech on the session with the selected grammar.
        Raises AttributeError if no grammar has been selected yet.
        """
        if self.grammar is None:
            # AttributeError is what this situation raised before the
            # attribute was initialised, so the exception type is kept
            raise AttributeError("no grammar selected, "
                                 "call setGrammar() before detectSpeech()")
        # TODO: we might not always want to call detect_speech
        # with this cmd, see js version for other options
        # also see detect_speech_function() in mod_dptools.c
        cmd = "%s %s %s %s" % (self.module_name,
                               self.grammar.name,
                               self.grammar.path,
                               self.ip_addr)
        console_log("debug", "calling detect_speech with: %s\n" % cmd)
        self.session.execute("detect_speech", cmd)
        console_log("debug", "finished calling detect_speech\n")
 class SpeechObtainer:
    """
    Gathers recognized phrases for one grammar.  run() keeps polling the
    session until enough phrases were detected or the tries are used up;
    handle_event() turns each speech event into a stored phrase.
    """
    # Values of the itype argument received by the input callback that
    # run() installs: 0 carries plain DTMF input, 1 carries a speech
    # event dictionary (see handle_event).
    INPUT_TYPE_DTMF = 0
    INPUT_TYPE_EVENT = 1
    def __init__(self, speech_detect, required_phrases, wait_time, max_tries):
        """
        @param speech_detect - the speech detect object, which holds a
                               reference to underlying session and can
                               be re-used by many SpeechObtainers
        @param required_phrases - the number of required phrases from the
                                  grammar.  for example if its prompting for
                                  the toppings on a sandwich and min toppings
                                  is 3, use 3.  normally will be 1.
        @param wait_time - the time, in milliseconds, to wait for
                           input during each loop iteration
        @param max_tries - this number multiplied by wait time gives the
                           'total wait time' before we give up and return
                           partial or no result
        """
        self.speech_detect=speech_detect
        self.required_phrases=required_phrases
        self.wait_time=wait_time
        self.max_tries=max_tries
        # phrases collected by handle_event(); only ever appended to
        self.detected_phrases = []
    def setGrammar(self, grammar):
        """
        Remember the grammar, register it with the speech detect object
        and select it there.
        @param grammar - instance of grammar class
        """
        self.grammar=grammar
        self.speech_detect.addGrammar(grammar)
        self.speech_detect.setGrammar(self.grammar.name)
    def detectSpeech(self):
        """
        Start detection through the underlying speech detect object.
        """
        self.speech_detect.detectSpeech()
    def run(self):
        """
        install the input callback and listen for results from the asr
        engine.  loops until the session is no longer ready, max_tries
        iterations have passed, or required_phrases phrases have been
        detected, then returns the list of detected phrases.
        the list is not cleared here, so phrases detected by an
        earlier run() count toward required_phrases as well.
        """
        def dtmf_handler(data, itype, funcargs):
            console_log("INFO","\n\nDTMF itype: %s\n" % itype)
            if itype == self.INPUT_TYPE_EVENT:
                # speech events decide the callback's return value
                return self.handle_event(data, funcargs)
            if itype == self.INPUT_TYPE_DTMF:
                console_log("INFO","\n\nDTMF input: %s\n" % data)
            else:
                console_log("INFO","\n\nUnknown input type: %s\n" % itype)
            return None
        num_tries = 0
        session = self.speech_detect.session
        console_log("debug", "setting dtmf callback\n")
        session.setDTMFCallback(dtmf_handler, "")
        console_log("debug", "calling getDigits\n")
        console_log("debug", "starting run() while loop\n")
        while (session.ready() and
               num_tries < self.max_tries and
               len(self.detected_phrases) < self.required_phrases):
            console_log("debug", "top of run() while loop\n")
            session.collectDigits(self.wait_time)
            num_tries += 1
        console_log("debug", "while loop finished\n")
        return self.detected_phrases
    def handle_event(self, event, funcargs):
        """
        when the dtmf handler receives an event, it calls back
        this method.  event is a dictionary with subdictionaries ..
        Example 1
        =========
        {'body': None, 'headers': {'Speech-Type': 'begin-speaking'}}
        Example 2
        =========
        {'body': '<result xmlns='http://www.ietf.org/xml/ns/mrcpv2'
        xmlns:ex='http://www.example.com/example' score='100'
        grammar='session:request1@form-level.store'><interpretation>
        <input mode='speech'>waffles</input></interpretation></result>',
        'headers': {'Speech-Type': 'detected-speech'}}
        This dictionary is constructed in run_dtmf_callback() in
        freeswitch_python.cpp
        Returns "stop" for a begin-speaking event and for a
        detected-speech event that yielded a phrase, otherwise None.
        A detected-speech event with an empty body, or whose xml has no
        element named by the grammar's obj_path, yields no phrase.
        Raises Exception for any other Speech-Type.
        @param event - dictionary with 'headers' and 'body' keys
        @param funcargs - extra callback argument, not used here
        """
        # what kind of event?
        headers = event['headers']
        speech_type = headers['Speech-Type']
        if speech_type == "begin-speaking":
            # not sure what to do with this, try returning "stop"
            # so that it might stop playing a sound file once
            # speech has been detected
            return "stop"
        if speech_type != "detected-speech":
            raise Exception("Unknown speech event: %s" % speech_type)
        # extract the detected phrase from the result
        # BUG: this assumes only ONE interpretation in the xml
        # result.  rest will get ignored
        # NOTE: have to wrap everything with str() (at least
        # calls to console_log because otherwise it chokes on
        # unicode strings.
        # TODO: check the score
        body = event['body']
        if not body:
            # nothing to parse, so nothing was recognized
            return None
        dom = minidom.parseString(body)
        matches = dom.getElementsByTagName(self.grammar.obj_path)
        if not matches:
            # result carries no element at obj_path; report "no phrase"
            # instead of failing with IndexError inside the callback
            return None
        phrase_text = self.getText(matches[0])
        if phrase_text:
            self.detected_phrases.append(str(phrase_text))
            # do we want to return stop?  what should we return?
            return "stop"
        return None
    def getText(self, elt):
        """ given an element, get its text.  if there is more than
        one text node child, just append all the text together.
        only direct children are considered, text nested inside
        child elements is not included.
        """
        return "".join([str(child.nodeValue)
                        for child in elt.childNodes
                        if child.nodeType == child.TEXT_NODE])
--- a/scripts/recipewizard.py
+++ b/scripts/recipewizard.py
@ -0,0 +1,87 @@
 from freeswitch import *
 from py_modules.speechtools import Grammar, SpeechDetect
 from py_modules.speechtools import SpeechObtainer
 import time, os
 # Passed to session.set_tts_parms() by RecipeWizard.__init__
 VOICE_ENGINE = "cepstral"
 VOICE = "William"
 # Directory that RecipeWizard.main() joins with "mainmenu.xml" to get
 # the path of the grammar file
 GRAMMAR_ROOT = "/usr/src/freeswitch_trunk/scripts"
 """
 Example speech recognition application in python.  
 How to make this work:
 * Get mod_openmrcp working along with an MRCP asr server
 * Add /usr/src/freeswitch/scripts or equivalent to your PYTHONPATH
 * Restart freeswitch
 * Create $GRAMMAR_ROOT/mainmenu.xml from the XML in the mainmenu() docstring
 """
 class RecipeWizard:
    """
    Small voice dialogue: greets the caller, listens for one phrase of
    the "mainmenu" grammar and speaks back what was recognized.
    The complete dialogue is run by the constructor.
    """
    def __init__(self, session):
        """
        Set the tts parameters on the session, then run main().
        @param session - session used for speaking and for detection
        """
        self.session = session
        session.set_tts_parms(VOICE_ENGINE, VOICE)
        self.main()
    def main(self):
        """
        Build the detection objects, start detection, play the greeting
        and report the first detected phrase, or apologize when nothing
        was detected.
        """
        console_log("debug", "recipe wizard main()\n")
        self.speechdetect = SpeechDetect(self.session, "openmrcp", "127.0.0.1")
        self.speechobtainer = SpeechObtainer(speech_detect=self.speechdetect,
                                             required_phrases=1,
                                             wait_time=5000,
                                             max_tries=3)
        grammar_file = os.path.join(GRAMMAR_ROOT, "mainmenu.xml")
        self.grammar = Grammar(name="mainmenu",
                               path=grammar_file,
                               obj_path="input",
                               min_score=80,
                               confirm_score=90)
        self.speechobtainer.setGrammar(self.grammar)
        console_log("debug", "calling speechobtainer.run()\n")
        # detection is started before the prompt is spoken
        self.speechobtainer.detectSpeech()
        self.session.speak("Hello. Welcome to the recipe wizard. Drinks or food?")
        phrases = self.speechobtainer.run()
        console_log("debug", "speechobtainer.run() result: %s\n" % phrases)
        if not phrases:
            self.session.speak("Sorry, I did not hear you")
        else:
            self.session.speak("Received result.  Result is: %s" % phrases[0])
        console_log("debug", "speechobtainer.run() finished\n")
 def mainmenu():
    """
    Does nothing when called; this docstring only carries the grammar
    xml to be saved as mainmenu.xml:

    <!DOCTYPE grammar PUBLIC "-//W3C//DTD GRAMMAR 1.0//EN"
             "http://www.w3.org/TR/speech-grammar/grammar.dtd">
    <grammar xmlns="http://www.w3.org/2001/06/grammar" xml:lang="en"
      xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
      xsi:schemaLocation="http://www.w3.org/2001/06/grammar
                      http://www.w3.org/TR/speech-grammar/grammar.xsd"
      version="1.0" mode="voice" root="root">
    <rule id="root" scope="public">
        <rule id="main">
          <one-of>
         <item weight="10">drinks</item>
         <item weight="2">food</item>
          </one-of>
        </rule>
    </rule>
    </grammar>
    """
 def handler(uuid):
    """
    Handle one call: answer it, run the recipe wizard, then hang up.
    @param uuid - value handed to PySession to obtain the session
    """
    call = PySession(uuid)
    call.answer()
    # constructing the wizard runs the whole dialogue (its __init__
    # calls main()), the instance is not used afterwards
    wizard = RecipeWizard(call)
    call.hangup("1")