Realtime Communication
This module implements realtime communication with audio between the robot and the user.
import asyncio
import json
import logging
import os
import re
import sys
import time
import wave

import cv2
import edge_tts
import mediapipe as mp
import numpy as np
import pyaudio
import pygame
from aiortc.contrib.media import MediaPlayer
from groq import Groq

from go2_webrtc_driver.webrtc_driver import Go2WebRTCConnection, WebRTCConnectionMethod
from unitree_sdk2py.core.channel import ChannelFactoryInitialize, ChannelSubscriber
from unitree_sdk2py.idl.default import unitree_go_msg_dds__SportModeState_
from unitree_sdk2py.idl.unitree_go.msg.dds_ import SportModeState_
from unitree_sdk2py.go2.vui.vui_client import VuiClient
from unitree_sdk2py.go2.sport.sport_client import (
    SportClient,
    PathPoint,
    SPORT_PATH_POINT_SIZE,
)
wf = None  # Global handle for the WAV file written by recv_audio_stream
# Constants for local microphone recording
SILENCE_THRESHOLD = 200  # Consecutive silent buffers before recording stops
SILENCE_RMS_THRESHOLD = 800  # RMS level below which a buffer counts as silence (ambient noise floor)
AUDIO_FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 44100
BUFFER_SIZE = 1024
TIMEOUT = 40  # Seconds without interaction before dropping back to keyword mode
KEYWORD = "spark"  # Keyword to listen for
FILENAME = "audio.wav"
# Configure logging (FATAL silences library output; lower the level when debugging)
logging.basicConfig(level=logging.FATAL)
# Properties of the audio stream received over WebRTC
samplerate = 48000 # Sample rate for WebRTC audio
channels = 2 # Stereo audio
filename = "output.wav"
record_duration = 5 # Record for 5 seconds
total_frames_to_record = record_duration * samplerate # Total frames for the specified duration
frames_recorded = 0 # Counter for the number of frames recorded
done_writing_to_file = False # Flag to indicate when writing is done
# Function to handle receiving audio frames and write them directly to file
async def recv_audio_stream(frame):
global frames_recorded, done_writing_to_file, wf
if done_writing_to_file:
return
# Convert the frame to audio data (assuming 16-bit PCM)
audio_data = np.frombuffer(frame.to_ndarray(), dtype=np.int16)
# Write the audio data directly to the WAV file
wf.writeframes(audio_data.tobytes())
# Update the frame counter
frames_recorded += len(audio_data) // channels
# If we've recorded enough frames, stop further recording
if frames_recorded >= total_frames_to_record:
# Close the WAV file
wf.close()
print(f"Audio recording complete, saved to {filename}")
done_writing_to_file = True
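# NOTE: recv_audio_stream is defined but never registered in main() below. The
# sketch here shows one way it could be wired up; it assumes the go2_webrtc_driver
# audio channel exposes switchAudioChannel() and add_track_callback() (verify
# against your driver version before relying on it).
def start_robot_audio_recording(conn):
    global wf
    wf = wave.open(filename, 'wb')  # Open the output WAV before frames arrive
    wf.setnchannels(channels)
    wf.setsampwidth(2)  # 16-bit PCM samples
    wf.setframerate(samplerate)
    conn.audio.switchAudioChannel(True)  # Assumed API: enable the robot's audio channel
    conn.audio.add_track_callback(recv_audio_stream)  # Assumed API: register the frame callback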
def speak_text(text):
"""Generate the text, change to your language"""
communicate = edge_tts.Communicate(text, voice='pt-BR-FranciscaNeural', rate='-10%')
communicate.save_sync("output.mp3")
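# pygame is imported but unused in this script; the sketch below shows how the
# generated MP3 could be played locally with pygame.mixer, e.g. when testing
# without the robot's speaker.
def play_speech_locally(mp3_path="output.mp3"):
    pygame.mixer.init()
    pygame.mixer.music.load(mp3_path)
    pygame.mixer.music.play()
    while pygame.mixer.music.get_busy():  # Block until playback finishes
        time.sleep(0.1)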
def save_audio_to_file(audio_data, filename):
try:
with wave.open(filename, 'wb') as wf:
wf.setnchannels(CHANNELS)
wf.setsampwidth(pyaudio.PyAudio().get_sample_size(AUDIO_FORMAT))
wf.setframerate(RATE)
wf.writeframes(audio_data)
except Exception as e:
print(f"Erro ao salvar o áudio: {e}")
def generate_completion(client, text):
completion = client.chat.completions.create(
model="llama-3.1-70b-versatile",
messages=[
{
"role": "system",
"content": "You are a robotic dog named Spark. Your task is to help users with their questions about you. Do not introduce yourself every time; only do so if asked. You were built and created by Kair, Ytalo, Eduardo, Andre. You can only communicate and answer in the same language of the user."
},
{
"role": "user",
"content": f"{text}"
}
],
temperature=.9,
max_tokens=1024,
top_p=.7,
stream=True,
stop=None,
)
response = ""
for chunk in completion:
response += chunk.choices[0].delta.content or ""
return response
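# Example usage, assuming a valid Groq API key:
#   client = Groq(api_key="YOUR_API_KEY")
#   reply = generate_completion(client, "What can you do?")
#   print(reply)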
def record_audio():
try:
p = pyaudio.PyAudio()
stream = p.open(format=AUDIO_FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=BUFFER_SIZE)
print("Gravando áudio...")
frames = []
silent_frames = 0
while silent_frames < SILENCE_THRESHOLD:
            data = stream.read(BUFFER_SIZE, exception_on_overflow=False)
            frames.append(data)
            # Compute RMS to detect silence (cast to float to avoid int16 overflow when squaring)
            samples = np.frombuffer(data, dtype=np.int16).astype(np.float64)
            rms = np.sqrt(np.mean(np.square(samples)))
            if rms < SILENCE_RMS_THRESHOLD:
                silent_frames += 1
            else:
                silent_frames = 0
        print("Recording finished.")
stream.stop_stream()
stream.close()
p.terminate()
return b''.join(frames)
except Exception as e:
print(f"Ocorreu um erro durante a gravação de áudio: {e}")
return None
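# With the constants above, recording stops only after SILENCE_THRESHOLD (200)
# consecutive quiet buffers, i.e. roughly 200 * 1024 / 44100 ≈ 4.6 seconds of
# continuous silence; raise SILENCE_RMS_THRESHOLD in noisy environments.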
async def listen_for_keyword(client, conn):
last_interaction_time = time.time()
conversation_active = False
print(f"Listening for '{KEYWORD}'...")
while True:
try:
            # Drop back to keyword-only mode after a period without interaction
            if conversation_active and time.time() - last_interaction_time > TIMEOUT:
                conversation_active = False
                print(f"No interaction. Waiting for '{KEYWORD}'...")
audio_data = record_audio()
if audio_data is None:
continue
save_audio_to_file(audio_data, FILENAME)
with open(FILENAME, "rb") as file:
transcription = client.audio.transcriptions.create(
file=(FILENAME, file.read()),
model="whisper-large-v3-turbo",
response_format="verbose_json",
)
command = transcription.text.lower()
            # Only respond when the keyword was heard or a conversation is already active
            if KEYWORD in command or conversation_active:
                conversation_active = True
                last_interaction_time = time.time()
                # Generate the chatbot response
                response = generate_completion(client, command).replace("\"", "").replace("'", "\"")
                print("Raw response:", response)
speak_text(response)
mp3_path = "output.mp3"
logging.info(f"Playing MP3: {mp3_path}")
player = MediaPlayer(mp3_path) # Use MediaPlayer for MP3
audio_track = player.audio # Get the audio track from the player
conn.pc.addTrack(audio_track)
                # Give the MP3 a few seconds to play before listening again
                await asyncio.sleep(4)
except Exception as e:
print(f"Ocorreu um erro: {e}")
# Main function for setting up the WebRTC connection and handling streams
async def main():
global wf
try:
        client = Groq(api_key="YOUR_API_KEY")  # Replace with your Groq API key
conn = Go2WebRTCConnection(WebRTCConnectionMethod.LocalAP)
# Connect to the device
await conn.connect()
await listen_for_keyword(client, conn)
except ValueError as e:
logging.error(f"Error in WebRTC connection: {e}")
if __name__ == "__main__":
try:
asyncio.run(main())
except KeyboardInterrupt:
# Handle Ctrl+C to exit gracefully.
print("\nProgram interrupted by user")
sys.exit(0)