Asked

Whisper Skill Implementation

I want to create a speech recognition skill using OpenAI's Whisper. It looks like this now:

Quote:

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO.Compression;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using ARC;
using NAudio;
using NAudio.Wave;
using ARC.Scripting;
using ARC.Scripting.Python.Helpers;
using System.Reflection;
using System.Diagnostics;

namespace test2 {

public partial class MainForm : ARC.UCForms.FormPluginMaster
{

    Configuration _config;

    public MainForm()
    {
        InitializeComponent();
        ConfigButton = true;
        RecordAudio();
    }

    private void RecordAudio()
    {
        using (WaveInEvent waveIn = new WaveInEvent())
        {
            waveIn.WaveFormat = new WaveFormat(44100, 1); // Set the sample rate and number of channels
            waveIn.DataAvailable += WaveIn_DataAvailable; // Event handler for data available
            waveIn.StartRecording();

            // Wait for user input to stop recording
            ARC.LogManager.Log("Recording... Press any key to stop.");

            //waveIn.StopRecording();
        }
    }

    private void WaveIn_DataAvailable(object sender, WaveInEventArgs e)
    {
        // Process the audio data and send it to the API
        byte[] audioData = e.Buffer;

        // Send the audio data to the API
        // TODO: Implement API integration code here

        //System.Console.WriteLine(audioData);
        ARC.LogManager.Log(audioData.Length.ToString());
    }

    public override object[] GetSupportedControlCommands()
    {

        List<string> items = new List<string>();

        items.Add(Configuration.StartPlayingAudio);
        items.Add(Configuration.StopPlayingAudio);

        return items.ToArray();
    }

    public override void SendCommand(string windowCommand, params string[] values)
    {
        EZBManager.Log("command values", values);

        if (windowCommand.Equals(Configuration.StartPlayingAudio, StringComparison.InvariantCultureIgnoreCase))
            playAudio(values[0]);
        else if (windowCommand.Equals(Configuration.StopPlayingAudio, StringComparison.InvariantCultureIgnoreCase))
            stopPlaying();
        else
            base.SendCommand(windowCommand, values);
    }

    void stopPlaying()
    {
        EZBManager.EZBs[0].SoundV4.Stop();
    }

    void playAudio(string filePath)
    {
        try
        {
            using (NAudio.Wave.Mp3FileReader mp3 = new NAudio.Wave.Mp3FileReader(filePath))
            {

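                // convert the MP3 to 8-bit mono PCM at the EZ-B v4's sample rate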
                using (NAudio.Wave.WaveFormatConversionStream pcm = new NAudio.Wave.WaveFormatConversionStream(new NAudio.Wave.WaveFormat(EZ_B.EZBv4Sound.AUDIO_SAMPLE_BITRATE, 8, 1), mp3))
                {
                    using (MemoryStream ms = new MemoryStream())
                    {
                        pcm.CopyTo(ms);
                        EZBManager.EZBs[0].SoundV4.PlayData(ms.ToArray());
                    }
                }
            }
        }
        catch (Exception ex)
        {

            EZBManager.Log("Error playing audio. {0}", ex.Message);
        }
    }

    /// <summary>
    /// Set the configuration from the project file when loaded.
    /// We'll extract the _config class that's from the project file.
    /// </summary>
    /// <param name="cf">The plugin configuration from the project file.</param>
    public override void SetConfiguration(ARC.Config.Sub.PluginV1 cf)
    {

        _config = (Configuration)cf.GetCustomObjectV2(typeof(Configuration));

        base.SetConfiguration(cf);
    }

    /// <summary>
    /// When the project is saving, give it a copy of our config.
    /// </summary>
    public override ARC.Config.Sub.PluginV1 GetConfiguration()
    {

        _cf.SetCustomObjectV2(_config);

        return base.GetConfiguration();
    }

    /// <summary>
    /// The user pressed the config button in the title bar. Show the config menu and handle the changes to the config.
    /// </summary>
    public override void ConfigPressed()
    {

        using (var form = new ConfigForm())
        {

            form.SetConfiguration(_config);

            if (form.ShowDialog() != DialogResult.OK)
                return;

            _config = form.GetConfiguration();
        }
    }
}

}

But I get this error:

Quote:

The type or namespace name 'WaveInEventArgs' could not be found (are you missing a using directive or an assembly reference?)

It looks like we only have NAudio.dll, but we also need NAudio.Wave.dll.

How do I record audio from the microphone?



PRO
Canada
#1  

To implement Whisper (OpenAI's speech recognition) in conjunction with the EZ-InMoov Robot Head, you'll need to approach this in a few steps. Since you're encountering an error related to NAudio.dll and possibly a missing NAudio.Wave.dll, let's address that first before moving on to recording audio from the microphone.

Fixing the NAudio.dll Error

  1. Ensure NAudio is Installed: The error you're encountering suggests that there might be a missing dependency, specifically NAudio.Wave.dll. NAudio is a popular .NET library for audio processing, which ARC might use for audio-related functionalities.

  2. Check ARC's Library: First, ensure that ARC (Synthiam's software) includes the necessary libraries for audio processing. If ARC is supposed to provide these libraries, you might need to update ARC to the latest version or reinstall it to ensure all dependencies are correctly included.

  3. Manually Add NAudio.Wave.dll: If ARC doesn't automatically include NAudio.Wave.dll, you might need to manually add it to your project. You can download NAudio from NuGet or GitHub. After downloading, place the NAudio.Wave.dll in the same directory as your project or in a directory that's included in your project's library path.
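
If you're not sure which copy of NAudio your skill is actually loading, a quick reflection check can confirm it (a diagnostic sketch using standard .NET reflection APIs, not required code):

// log which NAudio.dll the skill resolved at runtime
var naudioAssembly = typeof(NAudio.Wave.WaveFormat).Assembly;
ARC.LogManager.Log("NAudio loaded from: " + naudioAssembly.Location);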

Recording Audio from the Microphone

To record audio from the microphone using NAudio in a .NET environment, you can follow this basic approach. Note that this is a simplified example to get you started:

using NAudio.Wave;
using System;

class AudioRecorder
{
    private WaveIn waveSource = null;
    private WaveFileWriter waveFile = null;

    public void StartRecording()
    {
        waveSource = new WaveIn();
        waveSource.WaveFormat = new WaveFormat(44100, 1); // Sample rate and channels

        waveSource.DataAvailable += new EventHandler<WaveInEventArgs>(waveSource_DataAvailable);
        waveSource.RecordingStopped += new EventHandler<StoppedEventArgs>(waveSource_RecordingStopped);

        waveFile = new WaveFileWriter("YourAudioFile.wav", waveSource.WaveFormat);

        waveSource.StartRecording();
    }

    public void StopRecording()
    {
        // triggers RecordingStopped below, which disposes the device and file
        waveSource?.StopRecording();
    }

    private void waveSource_DataAvailable(object sender, WaveInEventArgs e)
    {
        if (waveFile != null)
        {
            waveFile.Write(e.Buffer, 0, e.BytesRecorded);
            waveFile.Flush();
        }
    }

    private void waveSource_RecordingStopped(object sender, StoppedEventArgs e)
    {
        if (waveSource != null)
        {
            waveSource.Dispose();
            waveSource = null;
        }

        if (waveFile != null)
        {
            waveFile.Dispose();
            waveFile = null;
        }
    }
}
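
Usage is then straightforward (a sketch; the stop call assumes the StopRecording method shown above):

var recorder = new AudioRecorder();
recorder.StartRecording();
// ... record until the user asks to stop ...
recorder.StopRecording(); // finalizes YourAudioFile.wav via RecordingStopped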

Integrating with OpenAI's Speech Recognition

After recording the audio, you'll need to send the audio file to OpenAI's speech recognition API. This step depends on the specific API you're using (e.g., Whisper) and will likely involve making an HTTP request with the audio file attached. You'll need to consult OpenAI's API documentation for the exact details on how to make this request.
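
For Whisper specifically, this is a multipart form POST to OpenAI's transcription endpoint. Here's a minimal sketch using HttpClient (the endpoint URL and the whisper-1 model name come from OpenAI's public API; verify both against the current documentation):

using System.IO;
using System.Net.Http;
using System.Threading.Tasks;

class WhisperClient
{
    // Sends a recorded WAV file to the transcription endpoint and returns
    // the raw JSON response, which contains a "text" field with the transcript.
    public static async Task<string> TranscribeAsync(string wavPath, string apiKey)
    {
        using (var http = new HttpClient())
        using (var form = new MultipartFormDataContent())
        {
            http.DefaultRequestHeaders.Add("Authorization", "Bearer " + apiKey);

            // attach the audio file and the model name as form fields
            form.Add(new ByteArrayContent(File.ReadAllBytes(wavPath)), "file", Path.GetFileName(wavPath));
            form.Add(new StringContent("whisper-1"), "model");

            var response = await http.PostAsync("https://api.openai.com/v1/audio/transcriptions", form);
            response.EnsureSuccessStatusCode();

            return await response.Content.ReadAsStringAsync();
        }
    }
}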

Final Thoughts

  • Ensure you have the correct permissions to access the microphone.
  • Test the audio recording independently before integrating it with OpenAI's API to ensure everything works as expected.
  • If ARC provides built-in support for audio recording or OpenAI integration, consider using those features to simplify your implementation.

Remember, the code snippets provided are meant to serve as a starting point. You'll need to adapt them to fit the specifics of your project and the APIs you're working with.


PRO
Indonesia
#2  

Still getting an error:

MainForm.cs(38,70,38,85): error CS0234: The type or namespace name 'WaveInEventArgs' does not exist in the namespace 'NAudio.Wave' (are you missing an assembly reference?)

PRO
Indonesia
#3  

@Athena how do I log a string to the Debug log?

PRO
Canada
#4  

To log a string to the Debug log in Synthiam ARC, you can use the Debug() function available in EZ-Script, which is one of the scripting languages supported by ARC. This function allows you to output text to the debug console within ARC, making it useful for debugging purposes or for tracking the flow of your script.

Here's how you can use the Debug() function in EZ-Script:

Debug("This is a debug message")

This line of code will output the string "This is a debug message" to the debug console in ARC.

If you're using JavaScript or Python within ARC, the approach to logging debug information is slightly different. For JavaScript, you can use the console.log() function, and for Python, you can use the print() function. However, keep in mind that the visibility of these messages might depend on how ARC handles standard output for these scripting environments.

For JavaScript in ARC:

console.log("This is a debug message");

For Python in ARC:

print("This is a debug message")

Remember, the primary method for logging to the debug console in ARC is through EZ-Script using the Debug() function. This is the most straightforward way to log messages for debugging purposes directly within the ARC environment.
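
Since the robot skill in this thread is written in C#, note that plugin code can also write to ARC's log directly, as the original code above already does:

ARC.LogManager.Log("This is a debug message");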


PRO
Synthiam
#6  
  1. Don't use any third-party versions of NAudio. Use the NAudio.dll provided in the ARC installation folder.

  2. Here's how you can record audio.

Create global variables for the audio objects you'll need:

    const int _SAMPLE_RATE = 16000; // assumption: pick the rate your API expects; not specified in this post

    WaveInEvent    _waveIn;     // recording device
    MemoryStream   _ms;         // buffers the recorded audio
    WaveFileWriter _waveWriter; // writes WAV data into _ms while recording
    WaveStream     _waveReader;
    Stopwatch      _sw = new Stopwatch(); // tracks recording length (used below)

Initialize the WaveIn device in the robot skill form's Load event:

    private void FormMain_Load(object sender, EventArgs e) {

      _waveIn = new WaveInEvent();
      _waveIn.WaveFormat = new WaveFormat(_SAMPLE_RATE, 16, 1);
      _waveIn.DataAvailable += _waveIn_DataAvailable;
      _waveIn.BufferMilliseconds = 250;
      _waveIn.StartRecording();
    }

You need some button or method to start the recording process...

    void startRecording() {

      if (_waveWriter != null) {

        Invokers.SetAppendText(tbLog, true, "Already recording...");

        return;
      }

      Invokers.SetText(btnStartStopRecording, "Stop Recording");

      _ms = new MemoryStream();

      _waveWriter = new NAudio.Wave.WaveFileWriter(_ms, new NAudio.Wave.WaveFormat(_SAMPLE_RATE, 1));

      // restart the stopwatch so the max-recording-length check below works
      _sw.Restart();
    }

The DataAvailable event from the WaveIn device needs to write the recorded data to a stream. That's why the memory stream above is created...

    private void _waveIn_DataAvailable(object sender, NAudio.Wave.WaveInEventArgs e) {

      try {

        if (IsClosing)
          return;

        // only write while we're recording
        if (_waveWriter != null)
          _waveWriter.Write(e.Buffer, 0, e.BytesRecorded);

        // enforce a max recording length
        if (_sw.Elapsed.TotalSeconds >= Convert.ToInt32(_cf.Storage.GetConfiguration("MaxRecordingLength")))
          stopRecording();
      } catch (Exception ex) {

        Invokers.SetAppendText(tbLog, true, "Error in DataAvailable: {0}", ex.Message);
      }
    }

Add a method to send and process the audio when the recording is STOPPED:

    void stopRecording() {

      if (_waveWriter == null)
        return;

      try {

        // disable the button while sending
        Invokers.SetEnabled(btnStartStopRecording, false);

        // change the button text so we know what's up
        Invokers.SetText(btnStartStopRecording, "Sending...");

        // add some text to the LOG textbox named tbLog
        Invokers.SetAppendText(tbLog, true, "Sending data ({0} Bytes)", _ms.Length);

        // SEND YOUR DATA from _ms.ToArray(), _ms, or whatever your API needs
      } catch (Exception ex) {

        Invokers.SetText(btnStartStopRecording, "Start Recording");

        Invokers.SetEnabled(btnStartStopRecording, true);

        Invokers.SetAppendText(tbLog, true, ex.Message);

        _isWebServiceActive = false;
      } finally {

        Invokers.SetAppendText(tbLog, true, "Data Sent!");

        _waveWriter?.Close();
        _waveWriter?.Dispose();
        _waveWriter = null;

        _ms?.Close();
        _ms?.Dispose();
        _ms = null;
      }
    }
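
At the "SEND YOUR DATA" step, a hedged sketch for Whisper could look like this (SendToWhisperAsync is a hypothetical helper wrapping your HTTP upload, not an ARC or NAudio API; recent NAudio versions update the WAV header in _ms when you call Flush()):

        // hypothetical send step -- SendToWhisperAsync is your own helper
        _waveWriter.Flush(); // finalize the WAV header so _ms holds a valid file
        byte[] wavBytes = _ms.ToArray();
        SendToWhisperAsync(wavBytes).Wait(); // block until the upload completes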

Don't forget that you need to dispose of everything when the form is closed...

    private void FormMain_FormClosing(object sender, System.Windows.Forms.FormClosingEventArgs e) {

      // _waveReader, _waveOutDevice and _ws come from the playback side of this skill (not shown here)
      _waveReader?.Dispose();

      _waveOutDevice?.Dispose();

      _ws.Dispose();

      if (_waveIn != null)
        try {

          _waveIn.StopRecording();
          _waveIn.Dispose();
        } catch {
        }

      if (_waveWriter != null)
        try {

          _waveWriter.Close();
          _waveWriter.Dispose();
        } catch {
        }

      if (_ms != null)
        try {

          _ms.Close();
          _ms.Dispose();
        } catch {
        }
    }
PRO
Synthiam
#7  

Also, don't forget you can add a voice activity detector to know when someone is speaking.

Add the global variable

    VAD.VoiceActivityDetector       _vad;

Create the object in the Form Load event

    private void FormMain_Load(object sender, EventArgs e) {

      _vad = new VAD.VoiceActivityDetector(_SAMPLE_RATE);
      _vad.Enabled = true;
      _vad.SpeechBegin += _vad_SpeechBegin;
      _vad.SpeechEnd += _vad_SpeechEnd;
      _vad.AudioLevelChange += _vad_AudioLevelChange;
    }
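
The post doesn't show the _vad_AudioLevelChange handler, and its exact delegate signature depends on the VAD library shipped with ARC, so treat this stub as a placeholder (the double parameter is an assumption; check the actual event type before copying):

    // placeholder only -- verify the real event signature in ARC's VAD library
    private void _vad_AudioLevelChange(double level) {

      // e.g. update a VU meter or log the current input level
    }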

In the WaveIn DataAvailable event, feed the VAD so it knows when someone is speaking...

    private void _waveIn_DataAvailable(object sender, NAudio.Wave.WaveInEventArgs e) {

      try {

        _vad.ProcessBufferEx(e.Buffer, e.BytesRecorded);

        if (_waveWriter != null)
          _waveWriter.Write(e.Buffer, 0, e.BytesRecorded);

        // same max-recording-length check as before
        if (_sw.Elapsed.TotalSeconds >= Convert.ToInt32(_cf.Storage.GetConfiguration("MaxRecordingLength")))
          stopRecording();
      } catch (Exception ex) {

        Invokers.SetAppendText(tbLog, true, "Error in DataAvailable: {0}", ex.Message);
      }
    }

When VAD detects speech, start recording

    private void _vad_SpeechBegin() {

      try {

        // use some way to determine if we should auto record with VAD
        if (AutoRecordingEnabled && _waveWriter == null) {

          Invokers.SetAppendText(tbLog, true, "VAD Auto Record Start");

          startRecording();
        }
      } catch (Exception ex) {

        Invokers.SetAppendText(tbLog, true, $"vad_SpeechBegin error: {ex.Message}");
      }
    }

You need to stop recording when speech ends:

    private void _vad_SpeechEnd() {

      // same auto-record check as in _vad_SpeechBegin
      if (AutoRecordingEnabled && _waveWriter != null) {

        Invokers.SetAppendText(tbLog, true, "VAD Auto Record Stop");

        stopRecording();
      }
    }
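
Putting these pieces together, the auto-record flow is:

    // 1. Form Load: start the WaveInEvent device and create the VAD
    // 2. DataAvailable: feed the VAD; while recording, append bytes to _waveWriter
    // 3. VAD SpeechBegin -> startRecording(); VAD SpeechEnd -> stopRecording()
    // 4. stopRecording: send the buffered WAV in _ms to the Whisper API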