diff --git a/avatar/streaming/.env.sample b/avatar/streaming/.env.sample new file mode 100644 index 00000000..c942cfe5 --- /dev/null +++ b/avatar/streaming/.env.sample @@ -0,0 +1,46 @@ +# Azure Speech - Resource Configuration +SPEECH_REGION = "" +SPEECH_KEY = "" + +# Azure Speech - Text to Speech Avatar Configuration +AVATAR_CHARACTER = "" +AVATAR_STYLE = "" +IS_CUSTOM_AVATAR = "" +AZURE_OPENAI_SYSTEM_PROMPT = "You are an AI language model integrated with a Text-to-Speech system. +Please provide all responses in plain text without any markdown formatting or special symbols like #, *, _. +Avoid using headings, bullet points, or any other markdown syntax. +Your responses should be suitable for direct verbal communication. +Your name is Lisa, an AI assistant that helps people find information. +Always provide responses that are concise and conversational, strictly limiting responses to 50 words (in one paragraph) and suitable for verbal delivery within 15 seconds. +If a response would exceed this limit, summarize the key points to fit within it and steer the user to ask more details instead. +If a user's question falls outside available data or context, respond with, 'I can't help with that specific query' or similar message. +Again, make sure that the responses are concisely and accurately summarized under 50 words." + +# Azure Speech - Text to Speech Voice Configuration +TTS_VOICE = "" +# CUSTOM_VOICE_ENDPOINT="" +# PERSONAL_VOICE_SPEAKER_PROFILE="" + +# Azure OpenAI - Resource Configuration +AZURE_OPENAI_ENDPOINT = "" +AZURE_OPENAI_API_KEY = "" +AZURE_OPENAI_DEPLOYMENT_NAME = "" + +# Azure Search - Resource Configuration (optional, only required for 'on your data' scenario) +COGNITIVE_SEARCH_ENDPOINT = "" +COGNITIVE_SEARCH_API_KEY = "" +COGNITIVE_SEARCH_INDEX_NAME = "" + +# CSS Variables (Landscape) +WEBPAGE_BACKGROUND_LANDSCAPE = "https://github.com/aadrikasingh/Azure-Text-To-Speech-Avatar/blob/main/static/image/landscape.png?raw=true" +WEBPAGE_CHAT_FONTCOLOR_LANDSCAPE = "#EEE" +BUTTON_COLOR_LANDSCAPE = "#3E66BA" +BUTTON_HOVER_LANDSCAPE = "#28a745" +BUTTON_ICON_COLOR_LANDSCAPE = "#FFF" + +# CSS Variables (Portrait) +WEBPAGE_BACKGROUND_PORTRAIT = "https://github.com/aadrikasingh/Azure-Text-To-Speech-Avatar/blob/main/static/image/portrait.png?raw=true" +WEBPAGE_CHAT_FONTCOLOR_PORTRAIT = "#EEE" +BUTTON_COLOR_PORTRAIT ="#3E66BA" +BUTTON_HOVER_PORTRAIT = "#28a745" +BUTTON_ICON_COLOR_PORTRAIT = "#FFF" \ No newline at end of file diff --git a/avatar/streaming/.gitignore b/avatar/streaming/.gitignore new file mode 100644 index 00000000..4ea03f07 --- /dev/null +++ b/avatar/streaming/.gitignore @@ -0,0 +1,3 @@ +/.venv +/__pycache__ +.env \ No newline at end of file diff --git a/avatar/streaming/README.md b/avatar/streaming/README.md new file mode 100644 index 00000000..642cec12 --- /dev/null +++ b/avatar/streaming/README.md @@ -0,0 +1,159 @@ +# Transforming Digital Interactions with Hyper-Realistic Custom Avatars and Custom Neural Voices + +This innovative solution combines Azure Text-to-Speech Custom Avatar Real-time API service and Custom Neural Voices to deliver hyper-realistic avatars with lifelike expressions and movements. Paired with advanced AI capabilities, these avatars enable seamless, human-like interactions tailored to diverse applications, from customer support to educational tools. 
By leveraging Retrieval Augmented Generation (RAG) using Azure OpenAI and Azure AI Search, the system ensures precise, contextually aware responses, redefining the way we engage and communicate in the digital age.
+
+---
+
+## Prerequisites
+
+Ensure the following Azure services are deployed before running this project:
+
+1. **Azure Speech Service**:
+   - For Text-to-Speech (TTS) and Speech-to-Text (STT) functionalities.
+2. **Azure OpenAI Service**:
+   - For natural language response generation using GPT models.
+3. **Azure AI Search Service**: _(optional, if using your own data)_
+   - For contextual data retrieval using the "Bring Your Own Data" feature of Azure OpenAI.
+   - You can follow the instructions [here](https://learn.microsoft.com/en-us/azure/ai-services/openai/use-your-data-quickstart?tabs=command-line%2Cjavascript-keyless%2Ctypescript-keyless%2Cpython-new&pivots=programming-language-studio).
+4. **Azure Storage Account**: _(optional, if using your own data)_
+   - To store customer-provided data for the search service.
+
+---
+
+## Setup Instructions
+
+### Step 1: Clone the Repository
+Clone the repository to your local environment:
+
+```
+git clone https://github.com/aadrikasingh/Azure-Text-To-Speech-Avatar.git
+cd Azure-Text-To-Speech-Avatar
+```
+
+### Step 2: Install Dependencies
+Install the required Python packages using:
+
+```
+pip install -r requirements.txt
+```
+
+### Step 3: Configure Environment Variables
+Create a `.env` file in the project root and set the following environment variables:
+
+**(Please ensure you begin with the `.env.sample` template)**
+
+#### Azure Speech Configuration
+```
+SPEECH_REGION = ""
+SPEECH_KEY = ""
+```
+
+#### Avatar Configuration
+```
+AVATAR_CHARACTER=""
+AVATAR_STYLE=""
+IS_CUSTOM_AVATAR=""
+```
+
+#### Neural Voice Configuration
+```
+TTS_VOICE=""
+CUSTOM_VOICE_ENDPOINT=""
+PERSONAL_VOICE_SPEAKER_PROFILE=""
+```
+
+#### Azure OpenAI Configuration
+```
+AZURE_OPENAI_ENDPOINT=""
+AZURE_OPENAI_API_KEY=""
+AZURE_OPENAI_DEPLOYMENT_NAME=""
+AZURE_OPENAI_SYSTEM_PROMPT=""
+```
+
+#### Azure AI Search Configuration (Optional)
+```
+COGNITIVE_SEARCH_ENDPOINT=""
+COGNITIVE_SEARCH_API_KEY=""
+COGNITIVE_SEARCH_INDEX_NAME=""
+```
+
+#### Webpage Customization
+For customizing the UI:
+```
+WEBPAGE_BACKGROUND_LANDSCAPE=""
+WEBPAGE_CHAT_FONTCOLOR_LANDSCAPE="#EEE"
+BUTTON_COLOR_LANDSCAPE="#3E66BA"
+BUTTON_HOVER_LANDSCAPE="#28a745"
+BUTTON_ICON_COLOR_LANDSCAPE="#FFF"
+
+WEBPAGE_BACKGROUND_PORTRAIT=""
+WEBPAGE_CHAT_FONTCOLOR_PORTRAIT="#EEE"
+BUTTON_COLOR_PORTRAIT="#3E66BA"
+BUTTON_HOVER_PORTRAIT="#28a745"
+BUTTON_ICON_COLOR_PORTRAIT="#FFF"
+```
+
+#### Set the Welcome Message
+To change the avatar's welcome message, update lines 267 and 268 in the `static/js/chat.js` file (the greeting text used by `sendGreetingMessage`).
+
+---
+
+## Running the Application
+
+1. **Start the Flask Application**:
+
+   Run the following command to launch the web app:
+   ```
+   python -m flask run -h 0.0.0.0 -p 5000
+   ```
+
+2. **Access the Web Interface (Landscape Orientation)**:
+
+   Open your browser and navigate to:
+   ```
+   http://localhost:5000/chat
+   ```
+
+3. **Access the Web Interface (Portrait Orientation)**:
+
+   Open your browser and navigate to:
+   ```
+   http://localhost:5000/portrait
+   ```
+
+4. **Initialize the Avatar Session**:
+   - Click the first button **(Start Avatar Session)** to establish a connection with Azure TTS Avatar services.
+   - If successful, you will see a live avatar video.
+
+5. **Interact with the Avatar**:
+   - Click the second button **(Start Microphone)** to enable speech input (ensure you allow microphone access in your browser).
+ - Speak or type queries (with the **Chat** button) + - The avatar will respond with synchronized audio and video. + +--- + +## Additional Features + +- **Interrupt Speech**: + Use the **"Stop Speaking"** button to halt the avatar mid-sentence. + +- **Clear Chat History**: + Reset the session by clicking the **"Clear Chat History"** button. + +- **Close Avatar Session**: + End the avatar interaction with the **"Close Avatar Session"** button. + +--- + +## Screenshots + +### Landscape Mode +![Landscape Mode](https://github.com/aadrikasingh/Azure-Text-To-Speech-Avatar/blob/main/assets/landscape.png?raw=true) + +### Portrait Mode +![Portrait Mode](https://github.com/aadrikasingh/Azure-Text-To-Speech-Avatar/blob/main/assets/portrait.png?raw=true) + +--- + +## Adaptation +This implementation is adapted from the sample tutorial code provided by Microsoft. For more details, refer to the [original tutorial](https://github.com/Azure-Samples/cognitive-services-speech-sdk/tree/master/samples/js/browser/avatar). + +--- diff --git a/avatar/streaming/app.py b/avatar/streaming/app.py new file mode 100644 index 00000000..6e2bd785 --- /dev/null +++ b/avatar/streaming/app.py @@ -0,0 +1,541 @@ +# Copyright (c) Microsoft. All rights reserved. +# Licensed under the MIT license. + +import os +import re +import time +import pytz +import uuid +import html +import json +import random +import requests +import datetime +import threading +import traceback +from openai import AzureOpenAI +import azure.cognitiveservices.speech as speechsdk +from azure.identity import DefaultAzureCredential +from flask import Flask, Response, render_template, request + +# Create the Flask app +app = Flask(__name__, template_folder='.') + +# Environment variables +# Speech resource (required) +speech_region = os.environ.get('SPEECH_REGION') +speech_key = os.environ.get('SPEECH_KEY') +speech_private_endpoint = os.environ.get('SPEECH_PRIVATE_ENDPOINT') # (optional) +speech_resource_url = os.environ.get('SPEECH_RESOURCE_URL') # (optional, only used for private endpoint) +user_assigned_managed_identity_client_id = os.environ.get('USER_ASSIGNED_MANAGED_IDENTITY_CLIENT_ID') # (optional, only used for private endpoint and user assigned managed identity) + +# OpenAI resource (required for chat scenario) +azure_openai_endpoint = os.environ.get('AZURE_OPENAI_ENDPOINT') +azure_openai_api_key = os.environ.get('AZURE_OPENAI_API_KEY') +azure_openai_deployment_name = os.environ.get('AZURE_OPENAI_DEPLOYMENT_NAME') +azure_openai_system_prompt = os.environ.get('AZURE_OPENAI_SYSTEM_PROMPT') + +# Cognitive search resource (optional, only required for 'on your data' scenario) +cognitive_search_endpoint = os.environ.get('COGNITIVE_SEARCH_ENDPOINT') +cognitive_search_api_key = os.environ.get('COGNITIVE_SEARCH_API_KEY') +cognitive_search_index_name = os.environ.get('COGNITIVE_SEARCH_INDEX_NAME') + +# Customized ICE server (optional, only required for customized ICE server) +ice_server_url = os.environ.get('ICE_SERVER_URL') # The ICE URL, e.g. turn:x.x.x.x:3478 +ice_server_url_remote = os.environ.get('ICE_SERVER_URL_REMOTE') # The ICE URL for remote side, e.g. turn:x.x.x.x:3478. This is only required when the ICE address for remote side is different from local side. 
+ice_server_username = os.environ.get('ICE_SERVER_USERNAME') # The ICE username +ice_server_password = os.environ.get('ICE_SERVER_PASSWORD') # The ICE password + +# TTS Avatar Configuration +avatar_character = os.environ.get('AVATAR_CHARACTER') +avatar_style = os.environ.get('AVATAR_STYLE') +is_custom_avatar = os.environ.get('IS_CUSTOM_AVATAR') + +background_color = "#1E1E1EFF" +background_image_url = None +transparent_background = "True" +video_crop = "False" + +# Avatar Voice Configuration +tts_voice = os.environ.get('TTS_VOICE') +custom_voice_endpoint = os.environ.get('CUSTOM_VOICE_ENDPOINT') # optional +personal_voice_speaker_profile_id = os.environ.get('PERSONAL_VOICE_SPEAKER_PROFILE') # optional + +# CSS Variables (Landscape) +webpage_background_landscape = os.environ.get('WEBPAGE_BACKGROUND_LANDSCAPE') +webpage_chat_fontcolor_landscape = os.environ.get('WEBPAGE_CHAT_FONTCOLOR_LANDSCAPE') +button_color_landscape = os.environ.get('BUTTON_COLOR_LANDSCAPE') +button_hover_color_landscape = os.environ.get('BUTTON_HOVER_LANDSCAPE') +button_icon_color_landscape = os.environ.get('BUTTON_ICON_COLOR_LANDSCAPE') + +# CSS Variables (Portrait) +webpage_background_portrait = os.environ.get('WEBPAGE_BACKGROUND_PORTRAIT') +webpage_chat_fontcolor_portrait = os.environ.get('WEBPAGE_CHAT_FONTCOLOR_PORTRAIT') +button_color_portrait = os.environ.get('BUTTON_COLOR_PORTRAIT') +button_hover_color_portrait = os.environ.get('BUTTON_HOVER_PORTRAIT') +button_icon_color_portrait = os.environ.get('BUTTON_ICON_COLOR_PORTRAIT') + +# Constant variables +sentence_level_punctuations = [ '.', '?', '!', ':', ';', '。', '?', '!', ':', ';' ] # Punctuations that indicate the end of a sentence +enable_quick_reply = False # Enable quick reply for certain chat models which take longer time to respond +quick_replies = [ 'Let me take a look.', 'Let me check.', 'One moment, please.' 
] # Quick reply reponses +oyd_doc_regex = re.compile(r'\[doc(\d+)\]') # Regex to match the OYD (on-your-data) document reference +symbol_regex = re.compile(r'[#*_`]') + +# Global variables +client_contexts = {} # Client contexts +speech_token = None # Speech token +ice_token = None # ICE token +azure_openai = AzureOpenAI(azure_endpoint=azure_openai_endpoint, api_version='2024-06-01', api_key=azure_openai_api_key) + +# The default route (index.html) +@app.route("/", methods=["GET"]) +def index(): + return render_template("index.html", client_id=initializeClient(), stt_locales="en-US", bg_img=webpage_background_landscape, chat_color=webpage_chat_fontcolor_landscape, button_color=button_color_landscape, button_hover_color=button_hover_color_landscape, button_icon_color=button_icon_color_landscape) + +# The chat route (index.html) +@app.route("/chat", methods=["GET"]) +def chatView(): + return render_template("index.html", client_id=initializeClient(), stt_locales="en-US", bg_img=webpage_background_landscape, chat_color=webpage_chat_fontcolor_landscape, button_color=button_color_landscape, button_hover_color=button_hover_color_landscape, button_icon_color=button_icon_color_landscape) + +@app.route("/portrait", methods=["GET"]) +def portraitView(): + return render_template("portrait.html", client_id=initializeClient(), stt_locales="en-US", bg_img=webpage_background_portrait, chat_color=webpage_chat_fontcolor_portrait, button_color=button_color_portrait, button_hover_color=button_hover_color_portrait, button_icon_color=button_icon_color_portrait) + +# The API route to get the speech token +@app.route("/api/getSpeechToken", methods=["GET"]) +def getSpeechToken() -> Response: + global speech_token + response = Response(speech_token, status=200) + response.headers['SpeechRegion'] = speech_region + if speech_private_endpoint: + response.headers['SpeechPrivateEndpoint'] = speech_private_endpoint + return response + +# The API route to get the ICE token +@app.route("/api/getIceToken", methods=["GET"]) +def getIceToken() -> Response: + # Apply customized ICE server if provided + if ice_server_url and ice_server_username and ice_server_password: + custom_ice_token = json.dumps({ + 'Urls': [ ice_server_url ], + 'Username': ice_server_username, + 'Password': ice_server_password + }) + return Response(custom_ice_token, status=200) + return Response(ice_token, status=200) + +# The API route to connect the TTS avatar +@app.route("/api/connectAvatar", methods=["POST"]) +def connectAvatar() -> Response: + + global client_contexts + client_id = uuid.UUID(request.headers.get('ClientId')) + client_context = client_contexts[client_id] + + # Override default values with client provided values + client_context['azure_openai_deployment_name'] = azure_openai_deployment_name + client_context['cognitive_search_index_name'] = cognitive_search_index_name + client_context['tts_voice'] = tts_voice + client_context['custom_voice_endpoint_id'] = custom_voice_endpoint + client_context['personal_voice_speaker_profile_id'] = personal_voice_speaker_profile_id + + custom_voice_endpoint_id = client_context['custom_voice_endpoint_id'] + + try: + if speech_private_endpoint: + speech_private_endpoint_wss = speech_private_endpoint.replace('https://', 'wss://') + speech_config = speechsdk.SpeechConfig(subscription=speech_key, endpoint=f'{speech_private_endpoint_wss}/tts/cognitiveservices/websocket/v1?enableTalkingAvatar=true') + else: + speech_config = speechsdk.SpeechConfig(subscription=speech_key, 
endpoint=f'wss://{speech_region}.tts.speech.microsoft.com/cognitiveservices/websocket/v1?enableTalkingAvatar=true') + + if custom_voice_endpoint_id: + speech_config.endpoint_id = custom_voice_endpoint_id + + client_context['speech_synthesizer'] = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None) + speech_synthesizer = client_context['speech_synthesizer'] + + ice_token_obj = json.loads(ice_token) + # Apply customized ICE server if provided + if ice_server_url and ice_server_username and ice_server_password: + ice_token_obj = { + 'Urls': [ ice_server_url_remote ] if ice_server_url_remote else [ ice_server_url ], + 'Username': ice_server_username, + 'Password': ice_server_password + } + + local_sdp = request.data.decode('utf-8') + avatar_config = { + 'synthesis': { + 'video': { + 'protocol': { + 'name': "WebRTC", + 'webrtcConfig': { + 'clientDescription': local_sdp, + 'iceServers': [{ + 'urls': [ ice_token_obj['Urls'][0] ], + 'username': ice_token_obj['Username'], + 'credential': ice_token_obj['Password'] + }] + }, + }, + 'format':{ + 'crop':{ + 'topLeft':{ + 'x': 600 if video_crop.lower() == 'true' else 0, + 'y': 0 + }, + 'bottomRight':{ + 'x': 1320 if video_crop.lower() == 'true' else 1920, + 'y': 1080 + } + }, + 'bitrate': 1000000 + }, + 'talkingAvatar': { + 'customized': is_custom_avatar.lower() == 'true', + 'character': avatar_character, + 'style': avatar_style, + # #00B140FF - chroma green, #009D57FF - television green, #0C8918FF - digital TV green + 'background': { + 'color': '#00B140FF' if transparent_background.lower() == 'true' else background_color, + 'image': { + 'url': background_image_url + } + } + } + } + } + } + + connection = speechsdk.Connection.from_speech_synthesizer(speech_synthesizer) + connection.set_message_property('speech.config', 'context', json.dumps(avatar_config)) + + speech_sythesis_result = speech_synthesizer.speak_text_async('').get() + print(f'Result id for avatar connection: {speech_sythesis_result.result_id}') + if speech_sythesis_result.reason == speechsdk.ResultReason.Canceled: + cancellation_details = speech_sythesis_result.cancellation_details + print(f"Speech synthesis canceled: {cancellation_details.reason}") + if cancellation_details.reason == speechsdk.CancellationReason.Error: + print(f"Error details: {cancellation_details.error_details}") + raise Exception(cancellation_details.error_details) + turn_start_message = speech_synthesizer.properties.get_property_by_name('SpeechSDKInternal-ExtraTurnStartMessage') + remoteSdp = json.loads(turn_start_message)['webrtc']['connectionString'] + + return Response(remoteSdp, status=200) + + except Exception as e: + traceback.print_exc() + return Response(f"Result ID: {speech_sythesis_result.result_id}. Error message: {e}", status=400) + +# The API route to speak a given SSML +@app.route("/api/speak", methods=["POST"]) +def speak() -> Response: + client_id = uuid.UUID(request.headers.get('ClientId')) + try: + ssml = request.data.decode('utf-8') + result_id = speakSsml(ssml, client_id, True) + return Response(result_id, status=200) + except Exception as e: + return Response(f"Speak failed. 
Error message: {e}", status=400) + +# The API route to stop avatar from speaking +@app.route("/api/stopSpeaking", methods=["POST"]) +def stopSpeaking() -> Response: + global client_contexts + client_id = uuid.UUID(request.headers.get('ClientId')) + is_speaking = client_contexts[client_id]['is_speaking'] + if is_speaking: + stopSpeakingInternal(client_id) + return Response('Speaking stopped.', status=200) + +# The API route for chat +# It receives the user query and return the chat response. +# It returns response in stream, which yields the chat response in chunks. +@app.route("/api/chat", methods=["POST"]) +def chat() -> Response: + global client_contexts + client_id = uuid.UUID(request.headers.get('ClientId')) + client_context = client_contexts[client_id] + chat_initiated = client_context['chat_initiated'] + if not chat_initiated: + initializeChatContext(azure_openai_system_prompt, client_id) + client_context['chat_initiated'] = True + user_query = request.data.decode('utf-8') + return Response(handleUserQuery(user_query, client_id), mimetype='text/plain', status=200) + +# The API route to clear the chat history +@app.route("/api/chat/clearHistory", methods=["POST"]) +def clearChatHistory() -> Response: + global client_contexts + client_id = uuid.UUID(request.headers.get('ClientId')) + client_context = client_contexts[client_id] + initializeChatContext(azure_openai_system_prompt, client_id) + client_context['chat_initiated'] = True + return Response('Chat history cleared.', status=200) + +# The API route to disconnect the TTS avatar +@app.route("/api/disconnectAvatar", methods=["POST"]) +def disconnectAvatar() -> Response: + global client_contexts + client_id = uuid.UUID(request.headers.get('ClientId')) + client_context = client_contexts[client_id] + speech_synthesizer = client_context['speech_synthesizer'] + try: + connection = speechsdk.Connection.from_speech_synthesizer(speech_synthesizer) + connection.close() + return Response('Disconnected avatar', status=200) + except: + return Response(traceback.format_exc(), status=400) + +# Initialize the client by creating a client id and an initial context +def initializeClient() -> uuid.UUID: + client_id = uuid.uuid4() + client_contexts[client_id] = { + 'speech_synthesizer': None, # Speech synthesizer for avatar + 'speech_token': None, # Speech token for client side authentication with speech service + 'ice_token': None, # ICE token for ICE/TURN/Relay server connection + 'chat_initiated': False, # Flag to indicate if the chat context is initiated + 'messages': [], # Chat messages (history) + 'is_speaking': False, # Flag to indicate if the avatar is speaking + 'spoken_text_queue': [], # Queue to store the spoken text + 'speaking_thread': None, # The thread to speak the spoken text queue + 'last_speak_time': None, # The last time the avatar spoke + 'data_sources': [] # Data sources for 'on your data' scenario + } + return client_id + +# Refresh the ICE token which being called +def refreshIceToken() -> None: + global ice_token + if speech_private_endpoint: + ice_token = requests.get(f'{speech_private_endpoint}/tts/cognitiveservices/avatar/relay/token/v1', headers={'Ocp-Apim-Subscription-Key': speech_key}).text + else: + ice_token = requests.get(f'https://{speech_region}.tts.speech.microsoft.com/cognitiveservices/avatar/relay/token/v1', headers={'Ocp-Apim-Subscription-Key': speech_key}).text + +# Refresh the speech token every 9 minutes +def refreshSpeechToken() -> None: + global speech_token + while True: + # Refresh the speech token every 9 
minutes + if speech_private_endpoint: + credential = DefaultAzureCredential(managed_identity_client_id=user_assigned_managed_identity_client_id) + token = credential.get_token('https://cognitiveservices.azure.com/.default') + speech_token = f'aad#{speech_resource_url}#{token.token}' + else: + speech_token = requests.post(f'https://{speech_region}.api.cognitive.microsoft.com/sts/v1.0/issueToken', headers={'Ocp-Apim-Subscription-Key': speech_key}).text + time.sleep(60 * 9) + +# Initialize the chat context, e.g. chat history (messages) +def initializeChatContext(system_prompt: str, client_id: uuid.UUID) -> None: + global client_contexts + client_context = client_contexts[client_id] + messages = client_context['messages'] + data_sources = client_context['data_sources'] + + # Initialize data sources for 'on your data' scenario + data_sources.clear() + if cognitive_search_endpoint and cognitive_search_api_key and cognitive_search_index_name: + # On-your-data scenario + data_source = { + 'type': 'azure_search', + 'parameters': { + 'endpoint': cognitive_search_endpoint, + 'index_name': cognitive_search_index_name, + 'authentication': { + 'type': 'api_key', + 'key': cognitive_search_api_key + }, + 'semantic_configuration': '', + 'query_type': 'simple', + 'fields_mapping': { + 'content_fields_separator': '\n', + 'content_fields': ['chunk'], + 'filepath_field': None, + 'title_field': 'title', + 'url_field': None + }, + 'in_scope': True, + 'role_information': system_prompt + } + } + data_sources.append(data_source) + + # Initialize messages + messages.clear() + if len(data_sources) == 0: + system_message = { + 'role': 'system', + 'content': system_prompt + } + messages.append(system_message) + + +# Handle the user query and return the assistant reply. For chat scenario. +# The function is a generator, which yields the assistant reply in chunks. 
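+# The generator also emits first-token and first-sentence latency measurements into the stream; the client strips
+# them out with the FTL/FSL regexes in static/js/chat.js before rendering the display text.
+# Whenever a sentence boundary is detected (a newline or sentence-level punctuation), the accumulated sentence is
+# handed to speakWithQueue so the avatar can start speaking while the rest of the OpenAI response is still streaming.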
+def handleUserQuery(user_query: str, client_id: uuid.UUID): + + global client_contexts + client_context = client_contexts[client_id] + messages = client_context['messages'] + data_sources = client_context['data_sources'] + + chat_message = { + 'role': 'user', + 'content': user_query + } + + messages.append(chat_message) + + if enable_quick_reply: + speakWithQueue(random.choice(quick_replies), 2000) + + assistant_reply = '' + spoken_sentence = '' + tool_content = '' + + aoai_start_time = datetime.datetime.now(pytz.UTC) + response = azure_openai.chat.completions.create( + model=azure_openai_deployment_name, + messages=messages, + extra_body={ 'data_sources' : data_sources } if len(data_sources) > 0 else None, + max_tokens = 150, + temperature = 0, + stream=True) + + is_first_chunk = True + is_first_sentence = True + for chunk in response: + if len(chunk.choices) > 0: + response_token = chunk.choices[0].delta.content + if response_token is not None: + # Log response_token here if need debug + if is_first_chunk: + first_token_latency_ms = round((datetime.datetime.now(pytz.UTC) - aoai_start_time).total_seconds() * 1000) + print(f"AOAI first token latency: {first_token_latency_ms}ms") + yield f"{first_token_latency_ms}" + is_first_chunk = False + if oyd_doc_regex.search(response_token): + response_token = oyd_doc_regex.sub('', response_token).strip() + if symbol_regex.search(response_token): + response_token = symbol_regex.sub('', response_token).strip() + yield response_token # yield response token to client as display text + assistant_reply += response_token # build up the assistant message + if response_token == '\n' or response_token == '\n\n': + if is_first_sentence: + first_sentence_latency_ms = round((datetime.datetime.now(pytz.UTC) - aoai_start_time).total_seconds() * 1000) + print(f"AOAI first sentence latency: {first_sentence_latency_ms}ms") + yield f"{first_sentence_latency_ms}" + is_first_sentence = False + speakWithQueue(spoken_sentence.strip(), 0, client_id) + spoken_sentence = '' + else: + response_token = response_token.replace('\n', '') + spoken_sentence += response_token # build up the spoken sentence + if len(response_token) == 1 or len(response_token) == 2: + for punctuation in sentence_level_punctuations: + if response_token.startswith(punctuation): + if is_first_sentence: + first_sentence_latency_ms = round((datetime.datetime.now(pytz.UTC) - aoai_start_time).total_seconds() * 1000) + print(f"AOAI first sentence latency: {first_sentence_latency_ms}ms") + yield f"{first_sentence_latency_ms}" + is_first_sentence = False + speakWithQueue(spoken_sentence.strip(), 0, client_id) + spoken_sentence = '' + break + + if spoken_sentence != '': + speakWithQueue(spoken_sentence.strip(), 0, client_id) + spoken_sentence = '' + + if len(data_sources) > 0: + tool_message = { + 'role': 'tool', + 'content': tool_content + } + messages.append(tool_message) + + assistant_message = { + 'role': 'assistant', + 'content': assistant_reply + } + messages.append(assistant_message) + +# Speak the given text. If there is already a speaking in progress, add the text to the queue. For chat scenario. 
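+# A background speakThread drains the queue in order, synthesizing each queued sentence via speakText, so new
+# sentences can keep arriving from the chat stream while earlier ones are still being spoken.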
+def speakWithQueue(text: str, ending_silence_ms: int, client_id: uuid.UUID) -> None:
+    global client_contexts
+    client_context = client_contexts[client_id]
+    spoken_text_queue = client_context['spoken_text_queue']
+    is_speaking = client_context['is_speaking']
+    spoken_text_queue.append(text)
+    if not is_speaking:
+        def speakThread():
+            nonlocal client_context
+            nonlocal spoken_text_queue
+            nonlocal ending_silence_ms
+            # tts_voice = client_context['tts_voice']
+            # personal_voice_speaker_profile_id = client_context['personal_voice_speaker_profile_id']
+            client_context['is_speaking'] = True
+            while len(spoken_text_queue) > 0:
+                text = spoken_text_queue.pop(0)
+                speakText(text, tts_voice, personal_voice_speaker_profile_id, ending_silence_ms, client_id)
+                client_context['last_speak_time'] = datetime.datetime.now(pytz.UTC)
+            client_context['is_speaking'] = False
+        client_context['speaking_thread'] = threading.Thread(target=speakThread)
+        client_context['speaking_thread'].start()
+
+# Speak the given text.
+def speakText(text: str, voice: str, speaker_profile_id: str, ending_silence_ms: int, client_id: uuid.UUID) -> str:
+    ssml = f"""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>
+                 <voice name='{voice}'>
+                     <mstts:ttsembedding speakerProfileId='{speaker_profile_id}'>
+                         {html.escape(text)}
+                     </mstts:ttsembedding>
+                 </voice>
+               </speak>"""
+    if ending_silence_ms > 0:
+        ssml = f"""<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='http://www.w3.org/2001/mstts' xml:lang='en-US'>
+                     <voice name='{voice}'>
+                         <mstts:ttsembedding speakerProfileId='{speaker_profile_id}'>
+                             {html.escape(text)}
+                             <break time='{ending_silence_ms}ms' />
+                         </mstts:ttsembedding>
+                     </voice>
+                   </speak>"""
+    return speakSsml(ssml, client_id, False)
+
+# Speak the given ssml with speech sdk
+def speakSsml(ssml: str, client_id: uuid.UUID, asynchronized: bool) -> str:
+    global client_contexts
+    speech_synthesizer = client_contexts[client_id]['speech_synthesizer']
+    speech_sythesis_result = speech_synthesizer.start_speaking_ssml_async(ssml).get() if asynchronized else speech_synthesizer.speak_ssml_async(ssml).get()
+    if speech_sythesis_result.reason == speechsdk.ResultReason.Canceled:
+        cancellation_details = speech_sythesis_result.cancellation_details
+        print(f"Speech synthesis canceled: {cancellation_details.reason}")
+        if cancellation_details.reason == speechsdk.CancellationReason.Error:
+            print(f"Result ID: {speech_sythesis_result.result_id}. 
Error details: {cancellation_details.error_details}") + raise Exception(cancellation_details.error_details) + return speech_sythesis_result.result_id + +# Stop speaking internal function +def stopSpeakingInternal(client_id: uuid.UUID) -> None: + global client_contexts + client_context = client_contexts[client_id] + speech_synthesizer = client_context['speech_synthesizer'] + spoken_text_queue = client_context['spoken_text_queue'] + spoken_text_queue.clear() + try: + connection = speechsdk.Connection.from_speech_synthesizer(speech_synthesizer) + connection.send_message_async('synthesis.control', '{"action":"stop"}').get() + except: + print("Sending message through connection object is not yet supported by current Speech SDK.") + +# Start the speech token refresh thread +speechTokenRefereshThread = threading.Thread(target=refreshSpeechToken) +speechTokenRefereshThread.daemon = True +speechTokenRefereshThread.start() + +# Fetch ICE token at startup +refreshIceToken() \ No newline at end of file diff --git a/avatar/streaming/assets/landscape.png b/avatar/streaming/assets/landscape.png new file mode 100644 index 00000000..92420062 Binary files /dev/null and b/avatar/streaming/assets/landscape.png differ diff --git a/avatar/streaming/assets/portrait.png b/avatar/streaming/assets/portrait.png new file mode 100644 index 00000000..146af83b Binary files /dev/null and b/avatar/streaming/assets/portrait.png differ diff --git a/avatar/streaming/index.html b/avatar/streaming/index.html new file mode 100644 index 00000000..4577a6e2 --- /dev/null +++ b/avatar/streaming/index.html @@ -0,0 +1,77 @@ + + + + + + AI Avatar + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + \ No newline at end of file diff --git a/avatar/streaming/portrait.html b/avatar/streaming/portrait.html new file mode 100644 index 00000000..e36b1f9c --- /dev/null +++ b/avatar/streaming/portrait.html @@ -0,0 +1,79 @@ + + + + + + AI Avatar + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + \ No newline at end of file diff --git a/avatar/streaming/requirements.txt b/avatar/streaming/requirements.txt new file mode 100644 index 00000000..f1fd75bc --- /dev/null +++ b/avatar/streaming/requirements.txt @@ -0,0 +1,7 @@ +azure-cognitiveservices-speech +azure-identity +flask +openai +pytz +requests +python-dotenv \ No newline at end of file diff --git a/avatar/streaming/static/css/portrait_styles.css b/avatar/streaming/static/css/portrait_styles.css new file mode 100644 index 00000000..3f52013a --- /dev/null +++ b/avatar/streaming/static/css/portrait_styles.css @@ -0,0 +1,189 @@ +/* General Body Styling*/ +body { + font-family: 'Segoe UI', sans-serif; + background-image: var(--background-image); + background-repeat: no-repeat; + background-size: cover; + margin: 0; + padding: 0; + display: flex; + flex-direction: column; + align-items: center; /* Center children horizontally */ + justify-content: center; /* Center children vertically */ + height: 100vh; + overflow: hidden; /* Adjust if necessary to handle overflow */ + position: relative; /* Allow for overlay positioning */ +} + +body::after { + content: ""; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: rgba(255, 255, 255, 0.05); /* Dark overlay */ + z-index: 1; /* Below content but above the background */ +} + +.avatar-block { + flex: 1; + display: flex; + align-items: center; /* Center content vertically */ + justify-content: center; /* Center content horizontally */ + width: 200%; /* Full width */ + height: 100%; /* Full height of the viewport */ + position: relative; + overflow: hidden; /* Prevent overflow */ + z-index: 2; +} + +video { + background: #222; + margin: 0 0 20px 0; + --width: 100%; + width: var(--width); + height: calc(var(--width) * 0.75); +} + +canvas { + position: absolute; /* Position it absolutely within its positioned container */ + bottom: 0; + width: 100%; /* Fill the width of its container */ + height: 100%; /* Adjust height to maintain the aspect ratio */ + object-fit: fill; /* Cover the entire area of the canvas without being cut off */ + background-color: transparent; /* Maintain a transparent background if necessary */ + display: block; /* Ensure the canvas is displayed */ + z-index: 3; /* Positioned correctly within the stacking context */ +} + +#remoteVideo { + display: none; +} + +.record-card { + position: fixed; + bottom: 0; + left: 0; + width: 100%; + padding: 10px; + z-index: 100; + background: rgba(255, 255, 255, 0.1); + backdrop-filter: blur(10px); + box-shadow: 0 -4px 30px rgba(0, 0, 0, 0.4); +} + +.button-container { + display: flex; + justify-content: center; + align-items: center; + gap: 15px; + flex-wrap: nowrap; + width: 100%; + overflow: hidden; +} + +.icon-button { + background-color: var(--button-color); + color: var(--button-icon-color); + border: none; + border-radius: 15px; + padding: 14px; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + font-size: 20px; + transition: background-color 0.3s ease, transform 0.3s ease; + box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2); + position: relative; + z-index: 2; +} + +.icon-button:hover { + background-color: var(--button-hover-color); + transform: scale(1.1); +} + +.icon-button[disabled] { + background-color: #ccc; + cursor: not-allowed; + box-shadow: none; +} + +#chat-box { + position: fixed; + bottom: 80px; + right: 20px; + width: 30%; /* Adjusted for better fit in portrait */ + height: 25%; /* Dynamic height adjustment */ + padding: 10px; + background-color: 
rgba(255, 255, 255, 0.2); + border-radius: 12px; + box-shadow: 0 4px 15px rgba(0, 0, 0, 0.5); + backdrop-filter: blur(10px); + border: 1px solid rgba(255, 255, 255, 0.25); + overflow: hidden; + display: flex; + flex-direction: column; + z-index: 999; +} + +#chatHistory, #userMessageBox { + background-color: transparent; + color: var(--chat-color); + padding: 10px; + margin: 2px 0; + border: none; + outline: none; + width: 100%; + resize: none; + scrollbar-width: thin; + font-family: 'Segoe UI', sans-serif; +} + +#chatHistory { + overflow-y: auto; + height: 250px; +} + +#chatHistory::-webkit-scrollbar { + width: 8px; +} + +#chatHistory::-webkit-scrollbar-thumb { + background-color: #888; + border-radius: 5px; +} + +#chatHistory::-webkit-scrollbar-track { + background-color: #2C2C2C; +} + +@media (orientation: portrait) { + body, .avatar-block { + flex-direction: column; + align-items: center; + justify-content: center; + } + + canvas, video { + width: 100%; + height: auto; + } +} + +@media (max-width: 768px) { + .avatar-block { + width: 100%; + } +} + +/* Hidden Elements */ +.hidden { + display: none !important; +} + +#latencyLog { + display: none; +} \ No newline at end of file diff --git a/avatar/streaming/static/css/styles.css b/avatar/streaming/static/css/styles.css new file mode 100644 index 00000000..1874fbbf --- /dev/null +++ b/avatar/streaming/static/css/styles.css @@ -0,0 +1,181 @@ +/* General Body Styling*/ +body { + font-family: 'Segoe UI', sans-serif; + background-image: var(--background-image); + background-repeat: no-repeat; + background-size: cover; + margin: 0; + padding: 0; + display: flex; + flex-direction: column; + align-items: center; /* Center children horizontally */ + justify-content: center; /* Center children vertically */ + height: 100vh; + overflow: hidden; /* Adjust if necessary to handle overflow */ + position: relative; /* Allow for overlay positioning */ +} + +.avatar-block { + flex: 1; + display: flex; + align-items: center; /* Center content vertically */ + justify-content: center; /* Center content horizontally */ + width: 70%; /* Full width */ + height: 100%; /* Full height of the viewport */ + position: relative; + overflow: hidden; /* Prevent overflow */ + z-index: 2; +} + +video { + background: #222; + margin: 0 0 20px 0; + --width: 100%; + width: var(--width); + height: calc(var(--width) * 0.75); +} + +canvas { + position: absolute; /* Position it absolutely within its positioned container */ + bottom: 0; + width: 100%; /* Fill the width of its container */ + height: auto; /* Adjust height to maintain the aspect ratio */ + object-fit: cover; /* Cover the entire area of the canvas without being cut off */ + background-color: transparent; /* Maintain a transparent background if necessary */ + display: block; /* Ensure the canvas is displayed */ + z-index: 3; /* Positioned correctly within the stacking context */ +} + +@media (max-width: 768px) { + video, canvas { + max-width: 100%; + height: auto; + } +} + +#remoteVideo { + display: none; +} + +/* Add background overlay for better readability */ +body::after { + content: ""; + position: absolute; + top: 0; + left: 0; + right: 0; + bottom: 0; + background: rgba(255, 255, 255, 0.05); /* Dark overlay */ + z-index: 1; /* Below content but above the background */ +} + +/* Fixed position for the record card with control buttons */ +.record-card { + position: fixed; + bottom: 0; + left: 0; + width: 100%; + padding: 10px; + z-index: 100; + background: rgba(255, 255, 255, 0.1); + backdrop-filter: blur(10px); + 
box-shadow: 0 -4px 30px rgba(0, 0, 0, 0.4); +} + +/* Control buttons */ +.button-container { + display: flex; + justify-content: center; + align-items: center; + gap: 15px; + flex-wrap: nowrap; + width: 100%; + overflow: hidden; +} + +.icon-button { + background-color: var(--button-color); + color: var(--button-icon-color); + border: none; + border-radius: 15px; + padding: 14px; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + font-size: 20px; + transition: background-color 0.3s ease, transform 0.3s ease; + box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2); + position: relative; + z-index: 2; +} + +.icon-button:hover { + background-color: var(--button-hover-color); + transform: scale(1.1); +} + +.icon-button[disabled] { + background-color: #ccc; + cursor: not-allowed; + box-shadow: none; +} + +/* Chat Box Styling for Overlapping Dialog */ +#chat-box { + position: fixed; + bottom: 80px; /* Adjust position as needed */ + right: 20px; /* Adjust position as needed */ + width: 300px; + height: 400px; + padding: 10px; + background-color: rgba(255, 255, 255, 0.2); /* Semi-transparent white background */ + border-radius: 12px; /* Rounded corners */ + box-shadow: 0 4px 15px rgba(0, 0, 0, 0.5); /* Soft shadow for depth */ + backdrop-filter: blur(10px); /* Blur effect on the background content */ + border: 1px solid rgba(255, 255, 255, 0.25); /* Subtle border for visibility */ + overflow: hidden; /* Keeps all content within the box */ + display: flex; + flex-direction: column; + z-index: 999; +} + +#chatHistory, #userMessageBox { + background-color: transparent; /* Make background transparent */ + color: var(--chat-color); + padding: 10px; /* Padding for spacing */ + margin: 2px 0; /* Space between elements */ + border: none; /* Remove borders */ + outline: none; /* Remove outline */ + width: 300px; + resize: none; + scrollbar-width: thin; + font-family: 'Segoe UI', sans-serif; +} + +#chatHistory { + overflow-y: auto; /* Allow scrolling */ + height: 250px; /* Fixed height with scrolling */ +} + +#chatHistory::-webkit-scrollbar { + width: 8px; +} + +#chatHistory::-webkit-scrollbar-thumb { + background-color: #888; + border-radius: 5px; +} + +#chatHistory::-webkit-scrollbar-track { + background-color: #2C2C2C; +} + +/* Hidden elements */ +#latencyLog { + display: none; +} + +.hidden { + display: none !important; +} \ No newline at end of file diff --git a/avatar/streaming/static/image/favicon.ico b/avatar/streaming/static/image/favicon.ico new file mode 100644 index 00000000..21b75e23 Binary files /dev/null and b/avatar/streaming/static/image/favicon.ico differ diff --git a/avatar/streaming/static/image/landscape.png b/avatar/streaming/static/image/landscape.png new file mode 100644 index 00000000..07b94aa0 Binary files /dev/null and b/avatar/streaming/static/image/landscape.png differ diff --git a/avatar/streaming/static/image/logo.png b/avatar/streaming/static/image/logo.png new file mode 100644 index 00000000..2a7e0319 Binary files /dev/null and b/avatar/streaming/static/image/logo.png differ diff --git a/avatar/streaming/static/image/portrait.png b/avatar/streaming/static/image/portrait.png new file mode 100644 index 00000000..b87fc632 Binary files /dev/null and b/avatar/streaming/static/image/portrait.png differ diff --git a/avatar/streaming/static/js/chat.js b/avatar/streaming/static/js/chat.js new file mode 100644 index 00000000..44e937bb --- /dev/null +++ b/avatar/streaming/static/js/chat.js @@ -0,0 +1,707 @@ +// Copyright (c) Microsoft. All rights reserved. 
+// Licensed under the MIT license. + +// Global objects +var clientId +var speechRecognizer +var peerConnection +var isSpeaking = false +var sessionActive = false +var recognitionStartedTime +var chatResponseReceivedTime +var lastSpeakTime +var isFirstRecognizingEvent = true +var firstTokenLatencyRegex = new RegExp(/(\d+)<\/FTL>/) +var firstSentenceLatencyRegex = new RegExp(/(\d+)<\/FSL>/) +var previousAnimationFrameTimestamp = 0; + +// Connect to avatar service +function connectAvatar() { + document.getElementById('startSession').disabled = true + + fetch('/api/getIceToken', { + method: 'GET', + }) + .then(response => { + if (response.ok) { + response.json().then(data => { + const iceServerUrl = data.Urls[0] + const iceServerUsername = data.Username + const iceServerCredential = data.Password + setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) + }) + } else { + throw new Error(`Failed fetching ICE token: ${response.status} ${response.statusText}`) + } + }) + + document.getElementById('configuration').hidden = true +} + +// Create speech recognizer +function createSpeechRecognizer() { + fetch('/api/getSpeechToken', { + method: 'GET', + }) + .then(response => { + if (response.ok) { + const speechRegion = response.headers.get('SpeechRegion') + const speechPrivateEndpoint = response.headers.get('SpeechPrivateEndpoint') + response.text().then(text => { + const speechToken = text + const speechRecognitionConfig = speechPrivateEndpoint ? + SpeechSDK.SpeechConfig.fromEndpoint(new URL(`wss://${speechPrivateEndpoint.replace('https://', '')}/stt/speech/universal/v2`), '') : + SpeechSDK.SpeechConfig.fromEndpoint(new URL(`wss://${speechRegion}.stt.speech.microsoft.com/speech/universal/v2`), '') + speechRecognitionConfig.authorizationToken = speechToken + speechRecognitionConfig.setProperty(SpeechSDK.PropertyId.SpeechServiceConnection_LanguageIdMode, "Continuous") + speechRecognitionConfig.setProperty("SpeechContext-PhraseDetection.TrailingSilenceTimeout", "3000") + speechRecognitionConfig.setProperty("SpeechContext-PhraseDetection.InitialSilenceTimeout", "10000") + speechRecognitionConfig.setProperty("SpeechContext-PhraseDetection.Dictation.Segmentation.Mode", "Custom") + speechRecognitionConfig.setProperty("SpeechContext-PhraseDetection.Dictation.Segmentation.SegmentationSilenceTimeoutMs", "200") + var sttLocales = document.getElementById('sttLocales').value.split(',') + var autoDetectSourceLanguageConfig = SpeechSDK.AutoDetectSourceLanguageConfig.fromLanguages(sttLocales) + speechRecognizer = SpeechSDK.SpeechRecognizer.FromConfig(speechRecognitionConfig, autoDetectSourceLanguageConfig, SpeechSDK.AudioConfig.fromDefaultMicrophoneInput()) + }) + } else { + throw new Error(`Failed fetching speech token: ${response.status} ${response.statusText}`) + } + }) +} + +// Disconnect from avatar service +function disconnectAvatar(closeSpeechRecognizer = false) { + fetch('/api/disconnectAvatar', { + method: 'POST', + headers: { + 'ClientId': clientId + }, + body: '' + }) + + if (speechRecognizer !== undefined) { + speechRecognizer.stopContinuousRecognitionAsync() + if (closeSpeechRecognizer) { + speechRecognizer.close() + } + } + + sessionActive = false +} + +// Setup WebRTC +function setupWebRTC(iceServerUrl, iceServerUsername, iceServerCredential) { + // Create WebRTC peer connection + peerConnection = new RTCPeerConnection({ + iceServers: [{ + urls: [ iceServerUrl ], + username: iceServerUsername, + credential: iceServerCredential + }], + iceTransportPolicy: 'relay' + }) + + // Fetch WebRTC 
video stream and mount it to an HTML video element + peerConnection.ontrack = function (event) { + if (event.track.kind === 'audio') { + let audioElement = document.createElement('audio') + audioElement.id = 'audioPlayer' + audioElement.srcObject = event.streams[0] + audioElement.autoplay = true + + audioElement.onplaying = () => { + console.log(`WebRTC ${event.track.kind} channel connected.`) + } + + // Clean up existing audio element if there is any + remoteVideoDiv = document.getElementById('remoteVideo') + for (var i = 0; i < remoteVideoDiv.childNodes.length; i++) { + if (remoteVideoDiv.childNodes[i].localName === event.track.kind) { + remoteVideoDiv.removeChild(remoteVideoDiv.childNodes[i]) + } + } + + // Append the new audio element + document.getElementById('remoteVideo').appendChild(audioElement) + } + + if (event.track.kind === 'video') { + let videoElement = document.createElement('video') + videoElement.id = 'videoPlayer' + videoElement.srcObject = event.streams[0] + videoElement.autoplay = true + videoElement.playsInline = true + + videoElement.onplaying = () => { + // Clean up existing video element if there is any + remoteVideoDiv = document.getElementById('remoteVideo') + canvas = document.getElementById('canvas') + remoteVideoDiv.style.width = '0.1px' + canvas.hidden = false + + for (var i = 0; i < remoteVideoDiv.childNodes.length; i++) { + if (remoteVideoDiv.childNodes[i].localName === event.track.kind) { + remoteVideoDiv.removeChild(remoteVideoDiv.childNodes[i]) + } + } + + window.requestAnimationFrame(makeBackgroundTransparent) + + // Append the new video element + document.getElementById('remoteVideo').appendChild(videoElement) + + console.log(`WebRTC ${event.track.kind} channel connected.`) + document.getElementById('microphone').disabled = false + document.getElementById('stopSession').disabled = false + document.getElementById('chatHistory').hidden = false + document.getElementById('latencyLog').hidden = false + document.getElementById('showTypeMessage').disabled = false + + if (document.getElementById('useLocalVideoForIdle').checked) { + document.getElementById('localVideo').hidden = true + if (lastSpeakTime === undefined) { + lastSpeakTime = new Date() + } + } + + setTimeout(() => { sessionActive = true }, 5000) // Set session active after 5 seconds + } + } + } + + // Listen to data channel, to get the event from the server + peerConnection.addEventListener("datachannel", event => { + const dataChannel = event.channel + dataChannel.onmessage = e => { + console.log("[" + (new Date()).toISOString() + "] WebRTC event received: " + e.data) + + if (e.data.includes("EVENT_TYPE_SWITCH_TO_SPEAKING")) { + if (chatResponseReceivedTime !== undefined) { + let speakStartTime = new Date() + let ttsLatency = speakStartTime - chatResponseReceivedTime + console.log(`TTS latency: ${ttsLatency} ms`) + let latencyLogTextArea = document.getElementById('latencyLog') + latencyLogTextArea.innerHTML += `TTS latency: ${ttsLatency} ms\n\n` + latencyLogTextArea.scrollTop = latencyLogTextArea.scrollHeight + chatResponseReceivedTime = undefined + } + + isSpeaking = true + document.getElementById('stopSpeaking').disabled = false + } else if (e.data.includes("EVENT_TYPE_SWITCH_TO_IDLE")) { + isSpeaking = false + lastSpeakTime = new Date() + document.getElementById('stopSpeaking').disabled = true + } + } + }) + + // This is a workaround to make sure the data channel listening is working by creating a data channel from the client side + c = peerConnection.createDataChannel("eventChannel") + + // 
Make necessary update to the web page when the connection state changes + peerConnection.oniceconnectionstatechange = e => { + console.log("WebRTC status: " + peerConnection.iceConnectionState) + if (peerConnection.iceConnectionState === 'disconnected') { + if (document.getElementById('useLocalVideoForIdle').checked) { + document.getElementById('localVideo').hidden = false + document.getElementById('remoteVideo').style.width = '0.1px' + } + } + } + + // Offer to receive 1 audio, and 1 video track + peerConnection.addTransceiver('video', { direction: 'sendrecv' }) + peerConnection.addTransceiver('audio', { direction: 'sendrecv' }) + + // Connect to avatar service when ICE candidates gathering is done + iceGatheringDone = false + + peerConnection.onicecandidate = e => { + if (!e.candidate && !iceGatheringDone) { + iceGatheringDone = true + connectToAvatarService(peerConnection) + } + } + + peerConnection.createOffer().then(sdp => { + peerConnection.setLocalDescription(sdp).then(() => { setTimeout(() => { + if (!iceGatheringDone) { + iceGatheringDone = true + connectToAvatarService(peerConnection) + } + }, 2000) }) + }) +} + +// Connect to TTS Avatar Service +function connectToAvatarService(peerConnection) { + let localSdp = btoa(JSON.stringify(peerConnection.localDescription)) + let headers = { + 'ClientId': clientId + } + + fetch('/api/connectAvatar', { + method: 'POST', + headers: headers, + body: localSdp + }) + .then(response => { + if (response.ok) { + response.text().then(text => { + const remoteSdp = text + peerConnection.setRemoteDescription(new RTCSessionDescription(JSON.parse(atob(remoteSdp)))) + .then(() => { + sendGreetingMessage(); + }); + }) + } else { + document.getElementById('startSession').disabled = false; + document.getElementById('configuration').hidden = false; + throw new Error(`Failed connecting to the Avatar service: ${response.status} ${response.statusText}`) + } + }) +} + +// Function to send a greeting message +function sendGreetingMessage() { + const greetingText = `Hi, my name is Lisa. How can I help you today?`; + let ssml = `${greetingText}`; + + fetch('/api/speak', { + method: 'POST', + headers: { + 'ClientId': clientId, + 'Content-Type': 'application/ssml+xml' + }, + body: ssml + }) + .then(response => { + if (response.ok) { + console.log('Greeting message sent successfully.'); + } else { + console.error('Failed to send greeting message.'); + } + }); +} + +// Handle user query. Send user query to the chat API and display the response. 
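+// The response is consumed as a stream: latency markers emitted by the server (matched by firstTokenLatencyRegex
+// and firstSentenceLatencyRegex) are logged and removed, and the remaining chunks are appended to the chat history.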
+function handleUserQuery(userQuery) { + let chatRequestSentTime = new Date() + fetch('/api/chat', { + method: 'POST', + headers: { + 'ClientId': clientId, + 'Content-Type': 'text/plain' + }, + body: userQuery + }) + .then(response => { + if (!response.ok) { + throw new Error(`Chat API response status: ${response.status} ${response.statusText}`) + } + + let chatHistoryTextArea = document.getElementById('chatHistory') + chatHistoryTextArea.innerHTML += 'Assistant: ' + + const reader = response.body.getReader() + + // Function to recursively read chunks from the stream + function read() { + return reader.read().then(({ value, done }) => { + // Check if there is still data to read + if (done) { + // Stream complete + return + } + + // Process the chunk of data (value) + let chunkString = new TextDecoder().decode(value, { stream: true }) + + if (firstTokenLatencyRegex.test(chunkString)) { + let aoaiFirstTokenLatency = parseInt(firstTokenLatencyRegex.exec(chunkString)[0].replace('', '').replace('', '')) + // console.log(`AOAI first token latency: ${aoaiFirstTokenLatency} ms`) + chunkString = chunkString.replace(firstTokenLatencyRegex, '') + if (chunkString === '') { + return read() + } + } + + if (firstSentenceLatencyRegex.test(chunkString)) { + let aoaiFirstSentenceLatency = parseInt(firstSentenceLatencyRegex.exec(chunkString)[0].replace('', '').replace('', '')) + chatResponseReceivedTime = new Date() + let chatLatency = chatResponseReceivedTime - chatRequestSentTime + let appServiceLatency = chatLatency - aoaiFirstSentenceLatency + console.log(`App service latency: ${appServiceLatency} ms`) + console.log(`AOAI latency: ${aoaiFirstSentenceLatency} ms`) + let latencyLogTextArea = document.getElementById('latencyLog') + latencyLogTextArea.innerHTML += `App service latency: ${appServiceLatency} ms\n` + latencyLogTextArea.innerHTML += `AOAI latency: ${aoaiFirstSentenceLatency} ms\n` + latencyLogTextArea.scrollTop = latencyLogTextArea.scrollHeight + chunkString = chunkString.replace(firstSentenceLatencyRegex, '') + if (chunkString === '') { + return read() + } + } + + chatHistoryTextArea.innerHTML += `${chunkString}` + chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight + + // Continue reading the next chunk + return read() + }) + } + + // Start reading the stream + return read() + }) +} + +// Handle local video. If the user is not speaking for 15 seconds, switch to local video. 
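+// Only applies when the 'useLocalVideoForIdle' checkbox is checked: the avatar session is disconnected and the
+// local idle video is shown; the session is re-established the next time the microphone is used.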
+function handleLocalVideo() { + if (lastSpeakTime === undefined) { + return + } + + let currentTime = new Date() + if (currentTime - lastSpeakTime > 15000) { + if (document.getElementById('useLocalVideoForIdle').checked && sessionActive && !isSpeaking) { + disconnectAvatar() + document.getElementById('localVideo').hidden = false + document.getElementById('remoteVideo').style.width = '0.1px' + sessionActive = false + } + } +} + +// Check whether the avatar video stream is hung +function checkHung() { + // Check whether the avatar video stream is hung, by checking whether the video time is advancing + let videoElement = document.getElementById('videoPlayer') + if (videoElement !== null && videoElement !== undefined && sessionActive) { + let videoTime = videoElement.currentTime + setTimeout(() => { + // Check whether the video time is advancing + if (videoElement.currentTime === videoTime) { + // Check whether the session is active to avoid duplicatedly triggering reconnect + if (sessionActive) { + sessionActive = false + if (document.getElementById('autoReconnectAvatar').checked) { + console.log(`[${(new Date()).toISOString()}] The video stream got disconnected, need reconnect.`) + connectAvatar() + createSpeechRecognizer() + } + } + } + }, 2000) + } +} + +function checkAndExecute() { + var checkbox = document.getElementById('showTypeMessage'); + if (checkbox.checked) { + window.updateTypeMessageBox(); + } + } + +function makeBackgroundTransparent(timestamp) { + if (!previousAnimationFrameTimestamp || timestamp - previousAnimationFrameTimestamp > 33) { + const video = document.getElementById('videoPlayer'); + const canvas = document.getElementById('canvas'); + if (video && canvas && video.videoWidth > 0 && video.videoHeight > 0) { + canvas.width = video.videoWidth; + canvas.height = video.videoHeight; + const context = canvas.getContext('2d'); + + // Clear the canvas + context.clearRect(0, 0, canvas.width, canvas.height); + + // Draw the video frame onto the canvas + context.drawImage(video, 0, 0, video.videoWidth, video.videoHeight); + let frame = context.getImageData(0, 0, video.videoWidth, video.videoHeight); + + // Process each pixel + for (let i = 0; i < frame.data.length; i += 4) { + let r = frame.data[i]; + let g = frame.data[i + 1]; + let b = frame.data[i + 2]; + let alpha = frame.data[i + 3]; + + // Strict green detection for transparency + if (g > 100 && g > r * 1.6 && g > b * 1.6) { + frame.data[i + 3] = 0; // Fully transparent for dominant green + } + // Soften edge pixels + else if (g > 80 && g > r * 1.4 && g > b * 1.4) { + frame.data[i + 3] = alpha * 0.1; // Partially transparent for soft edges + } + + // Green spill reduction + if (alpha > 0 && g > r * 1.2 && g > b * 1.2) { + let adjustment = (g - Math.max(r, b)) / 2; + frame.data[i] = Math.min(255, r + adjustment * 1.0); // Boost red + frame.data[i + 1] = Math.max(0, g - adjustment * 2.0); // Reduce green + frame.data[i + 2] = Math.min(255, b + adjustment * 1.0); // Boost blue + } + } + + context.putImageData(frame, 0, 0); + + // Apply edge-specific smoothing + smoothEdges(context, canvas.width, canvas.height); + } + previousAnimationFrameTimestamp = timestamp; + } + window.requestAnimationFrame(makeBackgroundTransparent); +} + +function smoothEdges(context, width, height) { + let frame = context.getImageData(0, 0, width, height); + let data = frame.data; + + for (let i = 0; i < data.length; i += 4) { + let alpha = data[i + 3]; + + // Only apply smoothing to semi-transparent pixels + if (alpha > 0 && alpha < 255) { + let 
+function smoothEdges(context, width, height) {
+    let frame = context.getImageData(0, 0, width, height);
+    let data = frame.data;
+
+    for (let i = 0; i < data.length; i += 4) {
+        let alpha = data[i + 3];
+
+        // Only apply smoothing to semi-transparent pixels
+        if (alpha > 0 && alpha < 255) {
+            let surroundingAlpha = getSurroundingAlphaAverage(data, i, width);
+            // Smooth the alpha value by blending it with the surrounding pixels' alpha
+            data[i + 3] = (alpha + surroundingAlpha) / 2;
+        }
+    }
+
+    context.putImageData(frame, 0, 0);
+}
+
+function getSurroundingAlphaAverage(data, index, width) {
+    let totalAlpha = 0;
+    let count = 0;
+
+    // Check surrounding pixels in a 3x3 grid
+    for (let y = -1; y <= 1; y++) {
+        for (let x = -1; x <= 1; x++) {
+            let neighborIndex = index + (y * width * 4) + (x * 4);
+            if (neighborIndex >= 0 && neighborIndex < data.length) {
+                totalAlpha += data[neighborIndex + 3];
+                count++;
+            }
+        }
+    }
+
+    return totalAlpha / count; // Average the alpha values of surrounding pixels
+}
+
+window.onload = () => {
+    clientId = document.getElementById('clientId').value
+    setInterval(() => {
+        checkHung()
+    }, 2000) // Check session activity every 2 seconds
+    checkAndExecute()
+}
+
+window.startSession = () => {
+    createSpeechRecognizer()
+    if (document.getElementById('useLocalVideoForIdle').checked) {
+        document.getElementById('startSession').disabled = true
+        document.getElementById('configuration').hidden = true
+        document.getElementById('microphone').disabled = false
+        document.getElementById('stopSession').disabled = false
+        document.getElementById('localVideo').hidden = false
+        document.getElementById('remoteVideo').style.width = '0.1px'
+        document.getElementById('chatHistory').hidden = false
+        document.getElementById('latencyLog').hidden = false
+        document.getElementById('showTypeMessage').disabled = false
+        return
+    }
+
+    connectAvatar()
+}
+
+window.stopSpeaking = () => {
+    document.getElementById('stopSpeaking').disabled = true
+
+    fetch('/api/stopSpeaking', {
+        method: 'POST',
+        headers: {
+            'ClientId': clientId
+        },
+        body: ''
+    })
+    .then(response => {
+        if (response.ok) {
+            console.log('Successfully stopped speaking.')
+        } else {
+            throw new Error(`Failed to stop speaking: ${response.status} ${response.statusText}`)
+        }
+    })
+}
+
+window.stopSession = () => {
+    document.getElementById('startSession').disabled = false
+    document.getElementById('microphone').disabled = true
+    document.getElementById('stopSession').disabled = true
+    document.getElementById('configuration').hidden = false
+    document.getElementById('chatHistory').hidden = true
+    document.getElementById('latencyLog').hidden = true
+    document.getElementById('showTypeMessage').checked = false
+    document.getElementById('showTypeMessage').disabled = true
+    document.getElementById('userMessageBox').hidden = true
+    if (document.getElementById('useLocalVideoForIdle').checked) {
+        document.getElementById('localVideo').hidden = true
+    }
+
+    disconnectAvatar(true)
+}
+
+window.clearChatHistory = () => {
+    fetch('/api/chat/clearHistory', {
+        method: 'POST',
+        headers: {
+            'ClientId': clientId
+        },
+        body: ''
+    })
+    .then(response => {
+        if (response.ok) {
+            document.getElementById('chatHistory').innerHTML = ''
+            document.getElementById('latencyLog').innerHTML = ''
+        } else {
+            throw new Error(`Failed to clear chat history: ${response.status} ${response.statusText}`)
+        }
+    })
+}
+
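+// NOTE: window.microphone below toggles continuous speech recognition on the recognizer created by
+// createSpeechRecognizer(). In the recognized callback, e.result.offset and e.result.duration are
+// reported by the Speech SDK in 100-nanosecond ticks, so dividing by 10000 converts them to
+// milliseconds; the logged STT latency is the time from recognitionStartedTime to the recognition
+// result, minus the point at which the user finished speaking. When the continuous conversation
+// checkbox is unchecked, the microphone stops automatically after each recognized phrase.
+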
+window.microphone = () => {
+    let microphoneButton = document.getElementById('microphone');
+    let microphoneIcon = microphoneButton.querySelector('i');
+
+    if (microphoneIcon.classList.contains('fa-microphone-slash')) {
+        // Stop microphone
+        microphoneButton.disabled = true;
+        speechRecognizer.stopContinuousRecognitionAsync(
+            () => {
+                microphoneIcon.classList.remove('fa-microphone-slash');
+                microphoneIcon.classList.add('fa-microphone');
+                microphoneButton.disabled = false;
+            }, (err) => {
+                console.log("Failed to stop continuous recognition:", err);
+                microphoneButton.disabled = false;
+            });
+        return;
+    }
+
+    if (document.getElementById('useLocalVideoForIdle').checked) {
+        if (!sessionActive) {
+            connectAvatar();
+        }
+
+        setTimeout(() => {
+            document.getElementById('audioPlayer').play();
+        }, 5000);
+    } else {
+        document.getElementById('audioPlayer').play();
+    }
+
+    microphoneButton.disabled = true;
+    speechRecognizer.recognizing = async (s, e) => {
+        if (isFirstRecognizingEvent && isSpeaking) {
+            window.stopSpeaking();
+            isFirstRecognizingEvent = false;
+        }
+    };
+
+    speechRecognizer.recognized = async (s, e) => {
+        if (e.result.reason === SpeechSDK.ResultReason.RecognizedSpeech) {
+            let userQuery = e.result.text.trim();
+            if (userQuery === '') {
+                return;
+            }
+
+            let recognitionResultReceivedTime = new Date();
+            let speechFinishedOffset = (e.result.offset + e.result.duration) / 10000;
+            let sttLatency = recognitionResultReceivedTime - recognitionStartedTime - speechFinishedOffset;
+            console.log(`STT latency: ${sttLatency} ms`);
+            let latencyLogTextArea = document.getElementById('latencyLog');
+            latencyLogTextArea.innerHTML += `STT latency: ${sttLatency} ms\n`;
+            latencyLogTextArea.scrollTop = latencyLogTextArea.scrollHeight;
+
+            // Automatically stop the microphone once a phrase is recognized, unless continuous conversation mode is enabled
+            if (!document.getElementById('continuousConversation').checked) {
+                microphoneButton.disabled = true;
+                speechRecognizer.stopContinuousRecognitionAsync(
+                    () => {
+                        microphoneIcon.classList.remove('fa-microphone-slash');
+                        microphoneIcon.classList.add('fa-microphone');
+                        microphoneButton.disabled = false;
+                    }, (err) => {
+                        console.log("Failed to stop continuous recognition:", err);
+                        microphoneButton.disabled = false;
+                    });
+            }
+
+            let chatHistoryTextArea = document.getElementById('chatHistory');
+            if (chatHistoryTextArea.innerHTML !== '' && !chatHistoryTextArea.innerHTML.endsWith('\n\n')) {
+                chatHistoryTextArea.innerHTML += '\n\n';
+            }
+
+            chatHistoryTextArea.innerHTML += "User: " + userQuery + '\n\n';
+            chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight;
+
+            handleUserQuery(userQuery);
+
+            isFirstRecognizingEvent = true;
+        }
+    };
+
+    recognitionStartedTime = new Date();
+    speechRecognizer.startContinuousRecognitionAsync(
+        () => {
+            microphoneIcon.classList.remove('fa-microphone');
+            microphoneIcon.classList.add('fa-microphone-slash');
+            microphoneButton.disabled = false;
+        }, (err) => {
+            console.log("Failed to start continuous recognition:", err);
+            microphoneButton.disabled = false;
+        });
+};
+
+window.toggleChat = () => {
+    const chatBox = document.getElementById("chat-box");
+    chatBox.style.display = chatBox.style.display === "none" ? "block" : "none";
+}
+
+window.updateTypeMessageBox = () => {
+    if (document.getElementById('showTypeMessage').checked) {
+        document.getElementById('userMessageBox').hidden = false
+        document.getElementById('userMessageBox').addEventListener('keyup', (e) => {
+            if (e.key === 'Enter') {
+                const userQuery = document.getElementById('userMessageBox').value
+                if (userQuery !== '') {
+                    let chatHistoryTextArea = document.getElementById('chatHistory')
+                    if (chatHistoryTextArea.innerHTML !== '' && !chatHistoryTextArea.innerHTML.endsWith('\n\n')) {
+                        chatHistoryTextArea.innerHTML += '\n\n'
+                    }
+
+                    chatHistoryTextArea.innerHTML += "User: " + userQuery.trim() + '\n\n'
+                    chatHistoryTextArea.scrollTop = chatHistoryTextArea.scrollHeight
+
+                    if (isSpeaking) {
+                        window.stopSpeaking()
+                    }
+
+                    handleUserQuery(userQuery.trim())
+                    document.getElementById('userMessageBox').value = ''
+                }
+            }
+        })
+    } else {
+        document.getElementById('userMessageBox').hidden = true
+    }
+}
+
+window.updateLocalVideoForIdle = () => {
+    if (document.getElementById('useLocalVideoForIdle').checked) {
+        document.getElementById('showTypeMessageCheckbox').hidden = true
+    } else {
+        document.getElementById('showTypeMessageCheckbox').hidden = false
+    }
+}
\ No newline at end of file
diff --git a/avatar/streaming/static/video/lisa-casual-sitting-idle.mp4 b/avatar/streaming/static/video/lisa-casual-sitting-idle.mp4
new file mode 100644
index 00000000..d007b189
Binary files /dev/null and b/avatar/streaming/static/video/lisa-casual-sitting-idle.mp4 differ