glue/src/stt.py at main · microsoft/glue · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
''' SPEECH-TO-TEXT USING MICROSOFT SPEECH API '''
''' nonstoptimm@gmail.com '''

# Import required packages
import os
import glob
import json
import logging
import codecs
import helper as he
import azure.cognitiveservices.speech as speechsdk
import params as pa

# Load and set configuration parameters
pa.get_config()

def request_endpoint(audio, speech_config, output_directory, lexical):
    """Request the speech service endpoint
    Args:
        audio: Input data frame
        speech_config: Choice between scoring and
        output_folder: LUIS app ID
        case: LUIS subscription key
        lexical: Minimum confidence score for LUIS result, between 0.00 and 1.00
    Returns:
        df: Scoring data frame with predicted intents and scores
    Raises:
        ConnectionError: If file is not found
    """
    audio_config = speechsdk.audio.AudioConfig(filename = audio)
    speech_recognizer = speechsdk.SpeechRecognizer(speech_config = speech_config, audio_config = audio_config)
    result = speech_recognizer.recognize_once()
    filename = audio[audio.rindex('\\')+1:]
    text = process_recognition(result, filename, output_directory, lexical)
    return text, filename

def process_recognition(result, filename, output_directory, lexical):
    """Process recognition received from the speech service
    Args:
        result: Result object returned by STT-service
        filename: Filename for output file
        output_directory: Output directory for the file
        lexical: Boolean to enable extended lexical version of STT-result
    Returns:
        text: Processed recognition as string
    """
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        if lexical:
            text = f"{format(result.text)}\t{json.loads(result.json)['NBest'][0]['Lexical']}"
        else:
            text = f"{format(result.text)}"
        logging.info(f"[INFO] - Recognition successful: {filename} -> {result.text}")
    elif result.reason == speechsdk.ResultReason.NoMatch:
        logging.warning(filename + "\t" + f"No speech could be recognized: {result.no_match_details}")
        text = ""
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        logging.error(filename+"\t"+ f"Speech Recognition canceled: {cancellation_details.reason}")
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            logging.error(f"Error details: {cancellation_details.error_details}")
        text = ""
    return text

# General Function
def write_transcription(output_directory, text):
    """Write transcription to file
    Args:
        text: Processed recognition as string
        output_directory: Output directory for the file
    Returns:
        Writes output to file
    """
    if not os.path.exists(f'{output_directory}/transcriptions.txt'):
        transfile = codecs.open(f'{output_directory}/transcriptions.txt', 'w', encoding='utf-8-sig')
        transfile.close()
        logging.warning(f'[INFO] - Created transcript file with utf-8 bom encoding.')
    with open(f"{output_directory}/transcriptions.txt", "a", encoding='utf-8-sig') as transfile:
        transfile.write(f'{text}\n')
        transfile.close()

def main(speech_files, output_directory, lexical = False, enable_proxy = False, *argv):
    """Main function for STT-functionality
    Args:
        speech_files: Directory of audio files to be transcribed
        output_directory: Output directory for the file
        lexical: Boolean to enable extended lexical version of STT-result
        enable_proxy: Boolean to enable proxy function in case you need it
        *argv: Proxy information if enable_proxy is True -> hostname: str, port: str, username: str, password: str
    Returns:
        zip(filenames, results): Zipped lists of filenames and STT-results as string
    """
    try:
        speech_config = speechsdk.SpeechConfig(subscription = pa.config_data['stt_key'], region = pa.config_data['stt_region'])
    except RuntimeError:
        logging.error("[ERROR] - Could not retrieve speech config")
    # If necessary, you can enable a proxy here:
    # set_proxy(hostname: str, port: str, username: str, password: str)
    if enable_proxy:
        speech_config.set_proxy(argv[0], argv[1], argv[2], argv[3])
    # Set speech service properties, requesting the detailed response format to make it compatible with lexical format, if wanted
    speech_config.set_service_property(name='format', value='detailed', channel=speechsdk.ServicePropertyChannel.UriQueryParameter)
    if pa.config_data['stt_endpoint'] != "":
        speech_config.endpoint_id = pa.config_data['stt_endpoint']
    logging.info(f'[INFO] - Starting to transcribe {len(next(os.walk(speech_files))[2])} audio files')
    results = []
    filenames = []
    for audio in glob.iglob(f'{speech_files}*av'):
        result, filename = request_endpoint(audio, speech_config, output_directory, lexical)
        results.append(result)
        filenames.append(filename)
    # Check the result
    return zip(filenames, results)

if __name__ == '__main__':
    main("input/audio/", "output/test/")