Skip to content

Commit 523115b

Browse files
feat: add WebRTC signaling, Firebase config updates, Human Expert connection flow
1 parent e32005b commit 523115b

15 files changed

Lines changed: 1157 additions & 288 deletions

File tree

.vscode/settings.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"editor.maxTokenizationLineLength": 20000
3+
}

app/build.gradle.kts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,12 @@ plugins {
55
id("org.jetbrains.kotlin.plugin.serialization") version "1.9.20"
66
id("com.google.android.libraries.mapsplatform.secrets-gradle-plugin")
77
id("kotlin-parcelize")
8+
id("com.google.gms.google-services")
89
}
910

11+
// Optionally redirect build output (e.g. to an NTFS drive to avoid a corrupted ExFAT build cache).
// The target location is machine-specific, so it is read from the `customBuildRoot` Gradle property
// (for example `customBuildRoot=C:/GradleBuild` in ~/.gradle/gradle.properties) instead of being
// hard-coded to a Windows path that is wrong on every other checkout. When the property is not
// set, Gradle's default build directory is used.
providers.gradleProperty("customBuildRoot").orNull?.let { buildRoot ->
    layout.buildDirectory.set(file("$buildRoot/app"))
}
13+
1014
android {
1115
namespace = "com.google.ai.sample"
1216
compileSdk = 35
@@ -96,4 +100,14 @@ dependencies {
96100

97101
// Camera Core to potentially fix missing JNI lib issue
98102
implementation("androidx.camera:camera-core:1.4.0")
103+
104+
// WebRTC
105+
implementation("io.getstream:stream-webrtc-android:1.1.1")
106+
107+
// WebSocket for signaling
108+
implementation("com.squareup.okhttp3:okhttp:4.12.0")
109+
110+
// Firebase
111+
implementation(platform("com.google.firebase:firebase-bom:32.7.2"))
112+
implementation("com.google.firebase:firebase-database")
99113
}

app/src/main/AndroidManifest.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
<?xml version="1.0" encoding="utf-8"?>
22
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
33
xmlns:tools="http://schemas.android.com/tools">
4+
<uses-permission android:name="android.permission.INTERNET" />
45
<!-- Storage permissions -->
56
<uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE" android:maxSdkVersion="32" />
67
<uses-permission android:name="android.permission.WRITE_EXTERNAL_STORAGE" android:maxSdkVersion="28" />

app/src/main/kotlin/com/google/ai/sample/ApiKeyDialog.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ fun ApiKeyDialog(
8888
ApiProvider.GOOGLE -> "https://makersuite.google.com/app/apikey"
8989
ApiProvider.CEREBRAS -> "https://cloud.cerebras.ai/"
9090
ApiProvider.VERCEL -> "https://vercel.com/ai-gateway"
91-
ApiProvider.HUMAN_EXPERT -> return@Button // No API key needed
91+
ApiProvider.HUMAN_EXPERT -> return@Button
9292
}
9393
val intent = Intent(Intent.ACTION_VIEW, Uri.parse(url))
9494
context.startActivity(intent)

app/src/main/kotlin/com/google/ai/sample/MainActivity.kt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,10 @@ class MainActivity : ComponentActivity() {
444444
if (result.resultCode == Activity.RESULT_OK && result.data != null) {
445445
val shouldTakeScreenshotOnThisStart = this@MainActivity.isProcessingExplicitScreenshotRequest
446446
Log.i(TAG, "MediaProjection permission granted. Starting ScreenCaptureService. Explicit request: $shouldTakeScreenshotOnThisStart")
447+
448+
// Notify ViewModel about the permission grant (for Human Expert WebRTC)
449+
photoReasoningViewModel?.onMediaProjectionPermissionGranted(result.resultCode, result.data!!)
450+
447451
val serviceIntent = Intent(this, ScreenCaptureService::class.java).apply {
448452
action = ScreenCaptureService.ACTION_START_CAPTURE
449453
putExtra(ScreenCaptureService.EXTRA_RESULT_CODE, result.resultCode)

app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt

Lines changed: 194 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,9 @@ import kotlinx.serialization.json.Json
7272
import kotlinx.serialization.json.JsonElement
7373
import kotlinx.serialization.json.jsonObject
7474
import kotlinx.serialization.json.jsonPrimitive
75+
import com.google.ai.sample.webrtc.WebRTCSender
76+
import com.google.ai.sample.webrtc.SignalingClient
77+
import org.webrtc.IceCandidate
7578

7679
class PhotoReasoningViewModel(
7780
application: Application,
@@ -85,6 +88,14 @@ class PhotoReasoningViewModel(
8588

8689
private var llmInference: LlmInference? = null
8790
private val TAG = "PhotoReasoningViewModel"
91+
92+
// WebRTC & Signaling
93+
private var webRTCSender: WebRTCSender? = null
94+
private var signalingClient: SignalingClient? = null
95+
private var lastMediaProjectionResultCode: Int = 0
96+
private var lastMediaProjectionResultData: Intent? = null
97+
98+
8899

89100
private fun Bitmap.toBase64(): String {
90101
val outputStream = ByteArrayOutputStream()
@@ -351,6 +362,10 @@ class PhotoReasoningViewModel(
351362
Log.w(TAG, "Error closing LlmInference", e)
352363
}
353364
llmInference = null
365+
366+
// WebRTC cleanup
367+
webRTCSender?.stop()
368+
signalingClient?.disconnect()
354369
}
355370

356371
private fun createChatWithSystemMessage(context: Context? = null): Chat {
@@ -509,8 +524,36 @@ class PhotoReasoningViewModel(
509524

510525
// Check for Human Expert model
511526
if (currentModel == ModelOption.HUMAN_EXPERT) {
512-
_uiState.value = PhotoReasoningUiState.Error("Human Expert mode is not yet connected. The Human Operator app is required.")
513-
return
527+
// If we already have a specialized session running, maybe just send the text?
528+
// For now, we assume the user hits "Send" to trigger the connection + task post.
529+
530+
// Initial task post message
531+
val userMessage = PhotoReasoningMessage(
532+
text = userInput,
533+
participant = PhotoParticipant.USER,
534+
imageUris = imageUrisForChat ?: emptyList(),
535+
isPending = false
536+
)
537+
_chatState.addMessage(userMessage)
538+
539+
_uiState.value = PhotoReasoningUiState.Loading
540+
541+
// We need to ensure we have MediaProjection permission.
542+
// The UI (PhotoReasoningScreen) calls requestMediaProjectionPermission before calling reason()
543+
// if permission is missing. So here we should ideally rely on onMediaProjectionPermissionGranted
544+
// having been called or already having the intent.
545+
546+
// But valid intent handling happens in onMediaProjectionPermissionGranted.
547+
// If reason() is called, it means we likely have permission or it was just granted.
548+
549+
// Check if we are already connected?
550+
if (signalingClient == null) {
551+
startHumanExpertSession(userInput)
552+
} else {
553+
// Already connected, just post the new task text or send via DataChannel if paired
554+
postTaskToHumanExpert(userInput)
555+
}
556+
return
514557
}
515558

516559
// Check for offline model (Gemma)
@@ -1097,9 +1140,155 @@ class PhotoReasoningViewModel(
10971140
}
10981141
}
10991142

1100-
/**
1101-
* Update the AI message in chat history
1102-
*/
1143+
// === Human Expert / WebRTC Logic ===
1144+
1145+
/**
 * Remembers the most recent MediaProjection permission result so a Human Expert
 * session can start screen capture from it later.
 *
 * This only stores the values; it does not start a session itself.
 * NOTE(review): the original author's notes say the permission callback in MainActivity goes on
 * to invoke a lambda that calls reason(), which is where the session is started - confirm
 * against MainActivity.requestMediaProjectionPermission.
 *
 * @param resultCode result code delivered with the permission grant.
 * @param data intent delivered with the permission grant.
 */
fun onMediaProjectionPermissionGranted(resultCode: Int, data: Intent) {
    Log.d(TAG, "onMediaProjectionPermissionGranted: Storing result. Code=$resultCode")

    // Keep both halves of the grant together for later use.
    lastMediaProjectionResultCode = resultCode
    lastMediaProjectionResultData = data
}
1158+
1159+
/**
 * Opens a Human Expert session and posts [taskText] as its first task.
 *
 * A stored MediaProjection grant is required; without one the UI state is set to an error and
 * nothing else happens. If a signaling client already exists, the task is simply posted on it.
 * Otherwise a [WebRTCSender] and a [SignalingClient] are created, their callbacks are wired to
 * the chat/UI state (state updates are dispatched on [Dispatchers.Main]), and the task is posted.
 *
 * @param taskText the user's request, forwarded to [postTaskToHumanExpert].
 */
private fun startHumanExpertSession(taskText: String) {
    if (lastMediaProjectionResultData == null) {
        _uiState.value = PhotoReasoningUiState.Error("Screen capture permission required.")
        return
    }

    if (signalingClient != null) {
        // Already connected: just post the new task.
        postTaskToHumanExpert(taskText)
        return
    }

    _uiState.value = PhotoReasoningUiState.Loading
    _chatState.addMessage(
        PhotoReasoningMessage(
            text = "Connecting to Human Expert network...",
            participant = PhotoParticipant.MODEL,
            isPending = true
        )
    )
    _chatMessagesFlow.value = _chatState.getAllMessages()

    // Initialize WebRTC sender
    webRTCSender = WebRTCSender(getApplication(), object : WebRTCSender.WebRTCSenderListener {
        override fun onLocalICECandidate(candidate: IceCandidate) {
            signalingClient?.sendICECandidate(candidate.sdp, candidate.sdpMid, candidate.sdpMLineIndex)
        }

        override fun onConnectionStateChanged(state: String) {
            Log.d(TAG, "WebRTC State: $state")
            viewModelScope.launch(Dispatchers.Main) {
                when (state) {
                    "CONNECTED" -> {
                        _commandExecutionStatus.value = "Expert connected. Sharing screen."
                        replaceAiMessageText(
                            "Expert connected! They can now see your screen and control your device.",
                            isPending = false
                        )
                    }
                    "DISCONNECTED", "FAILED" -> {
                        // Only the status line changes; the chat message is kept as is.
                        _commandExecutionStatus.value = "Expert disconnected."
                    }
                }
            }
        }

        override fun onTapReceived(x: Float, y: Float) {
            dispatchTap(x, y)
        }

        override fun onError(message: String) {
            Log.e(TAG, "WebRTC Error: $message")
            viewModelScope.launch(Dispatchers.Main) {
                _uiState.value = PhotoReasoningUiState.Error("Video stream error: $message")
            }
        }
    })
    webRTCSender?.initialize()

    // Initialize signaling
    signalingClient = SignalingClient(object : SignalingClient.SignalingListener {
        override fun onTaskPosted(taskId: String) {
            viewModelScope.launch(Dispatchers.Main) {
                replaceAiMessageText("Task posted. Waiting for an expert to claim it...", isPending = true)
            }
        }

        override fun onTaskClaimed(taskId: String) {
            Log.d(TAG, "Task claimed! Starting WebRTC negotiation.")

            // This callback fires some time after the session was started, so re-read the stored
            // grant and guard it instead of force-unwrapping: a missing grant becomes a visible
            // error state rather than a NullPointerException.
            val projectionData = lastMediaProjectionResultData
            if (projectionData == null) {
                Log.e(TAG, "Task claimed but no MediaProjection grant is available.")
                viewModelScope.launch(Dispatchers.Main) {
                    _uiState.value = PhotoReasoningUiState.Error("Screen capture permission required.")
                }
                return
            }

            viewModelScope.launch(Dispatchers.Main) {
                replaceAiMessageText("Expert found! Establishing video connection...", isPending = true)
            }

            // Start screen capture, then negotiate: the offer is sent through signaling.
            webRTCSender?.startScreenCapture(projectionData)
            webRTCSender?.createPeerConnection()
            webRTCSender?.createOffer { sdp ->
                signalingClient?.sendOffer(sdp)
            }
        }

        override fun onSDPAnswer(sdp: String) {
            webRTCSender?.setRemoteAnswer(sdp)
        }

        override fun onICECandidate(candidate: String, sdpMid: String?, sdpMLineIndex: Int) {
            webRTCSender?.addIceCandidate(candidate, sdpMid, sdpMLineIndex)
        }

        override fun onPeerDisconnected() {
            viewModelScope.launch(Dispatchers.Main) {
                _commandExecutionStatus.value = "Expert disconnected."
                replaceAiMessageText("Expert disconnected.", isPending = false)
                // Only the WebRTC sender is stopped; the signaling client is left in place.
                // NOTE(review): later tasks will reuse this stopped sender because
                // signalingClient stays non-null - confirm WebRTCSender can restart after stop().
                webRTCSender?.stop()
            }
        }

        override fun onError(message: String) {
            viewModelScope.launch(Dispatchers.Main) {
                _uiState.value = PhotoReasoningUiState.Error("Signaling error: $message")
            }
        }
    })

    // Post the task immediately
    Log.d(TAG, "Signaling initialized. Posting task.")
    postTaskToHumanExpert(taskText)
}
1262+
1263+
/**
 * Posts [text] as a task on the current signaling client.
 *
 * Does nothing when no signaling client exists. The task is posted with
 * `hasScreenshot = false`; the original note says the live stream is captured instead.
 */
private fun postTaskToHumanExpert(text: String) {
    val client = signalingClient ?: return
    client.postTask(text, hasScreenshot = false)
}
1266+
1267+
/**
 * Executes a tap requested by the remote expert.
 *
 * [x] and [y] are treated as fractions of the display size (the original notes say the web
 * client sends normalized 0-1 values). They are scaled to pixels using the application's
 * display metrics and clamped to the display bounds, because the values come from a remote
 * peer and must not produce off-screen coordinates. The resulting tap is handed to
 * [ScreenOperatorAccessibilityService] and the command status is updated on the main dispatcher.
 *
 * NOTE(review): displayMetrics may not match the captured frame size (e.g. system bars) -
 * confirm the expert's view and these dimensions agree.
 */
private fun dispatchTap(x: Float, y: Float) {
    Log.d(TAG, "Dispatching tap: ($x, $y)")

    val displayMetrics = getApplication<Application>().resources.displayMetrics
    val absX = normalizedToPixel(x, displayMetrics.widthPixels)
    val absY = normalizedToPixel(y, displayMetrics.heightPixels)

    // Command.TapCoordinates takes its coordinates as strings.
    val command = Command.TapCoordinates(absX.toString(), absY.toString())
    ScreenOperatorAccessibilityService.executeCommand(command)

    viewModelScope.launch(Dispatchers.Main) {
        _commandExecutionStatus.value = "Expert tapped at ($absX, $absY)"
    }
}

/**
 * Scales a normalized [fraction] to a pixel index within `0..sizePx - 1`.
 *
 * Out-of-range input (negative, greater than 1, infinite) is clamped to the nearest edge;
 * NaN converts to 0 via [Float.toInt].
 */
private fun normalizedToPixel(fraction: Float, sizePx: Int): Int {
    val lastPixel = (sizePx - 1).coerceAtLeast(0)
    return (fraction * sizePx).toInt().coerceIn(0, lastPixel)
}
1289+
1290+
1291+
11031292
private fun finalizeAiMessage(finalText: String) {
11041293
Log.d(TAG, "finalizeAiMessage: Finalizing AI message.")
11051294
val messages = _chatState.getAllMessages().toMutableList()

0 commit comments

Comments
 (0)