@@ -72,6 +72,9 @@ import kotlinx.serialization.json.Json
7272import kotlinx.serialization.json.JsonElement
7373import kotlinx.serialization.json.jsonObject
7474import kotlinx.serialization.json.jsonPrimitive
75+ import com.google.ai.sample.webrtc.WebRTCSender
76+ import com.google.ai.sample.webrtc.SignalingClient
77+ import org.webrtc.IceCandidate
7578
7679class PhotoReasoningViewModel (
7780 application : Application ,
@@ -85,6 +88,14 @@ class PhotoReasoningViewModel(
8588
8689 private var llmInference: LlmInference ? = null
8790 private val TAG = " PhotoReasoningViewModel"
91+
92+ // WebRTC & Signaling
93+ private var webRTCSender: WebRTCSender ? = null
94+ private var signalingClient: SignalingClient ? = null
95+ private var lastMediaProjectionResultCode: Int = 0
96+ private var lastMediaProjectionResultData: Intent ? = null
97+
98+
8899
89100 private fun Bitmap.toBase64 (): String {
90101 val outputStream = ByteArrayOutputStream ()
@@ -351,6 +362,10 @@ class PhotoReasoningViewModel(
351362 Log .w(TAG , " Error closing LlmInference" , e)
352363 }
353364 llmInference = null
365+
366+ // WebRTC cleanup
367+ webRTCSender?.stop()
368+ signalingClient?.disconnect()
354369 }
355370
356371 private fun createChatWithSystemMessage (context : Context ? = null): Chat {
@@ -509,8 +524,36 @@ class PhotoReasoningViewModel(
509524
510525 // Check for Human Expert model
511526 if (currentModel == ModelOption .HUMAN_EXPERT ) {
512- _uiState .value = PhotoReasoningUiState .Error (" Human Expert mode is not yet connected. The Human Operator app is required." )
513- return
527+ // If we already have a specialized session running, maybe just send the text?
528+ // For now, we assume the user hits "Send" to trigger the connection + task post.
529+
530+ // Initial task post message
531+ val userMessage = PhotoReasoningMessage (
532+ text = userInput,
533+ participant = PhotoParticipant .USER ,
534+ imageUris = imageUrisForChat ? : emptyList(),
535+ isPending = false
536+ )
537+ _chatState .addMessage(userMessage)
538+
539+ _uiState .value = PhotoReasoningUiState .Loading
540+
541+ // We need to ensure we have MediaProjection permission.
542+ // The UI (PhotoReasoningScreen) calls requestMediaProjectionPermission before calling reason()
543+ // if permission is missing. So here we should ideally rely on onMediaProjectionPermissionGranted
544+ // having been called or already having the intent.
545+
546+ // But valid intent handling happens in onMediaProjectionPermissionGranted.
547+ // If reason() is called, it means we likely have permission or it was just granted.
548+
549+ // Check if we are already connected?
550+ if (signalingClient == null ) {
551+ startHumanExpertSession(userInput)
552+ } else {
553+ // Already connected, just post the new task text or send via DataChannel if paired
554+ postTaskToHumanExpert(userInput)
555+ }
556+ return
514557 }
515558
516559 // Check for offline model (Gemma)
@@ -1097,9 +1140,155 @@ class PhotoReasoningViewModel(
10971140 }
10981141 }
10991142
1100- /* *
1101- * Update the AI message in chat history
1102- */
1143+ // === Human Expert / WebRTC Logic ===
1144+
/**
 * Records the outcome of the screen-capture (MediaProjection) permission request.
 *
 * This only stores the values; it does not start a session. [startHumanExpertSession]
 * later reads the stored [Intent] and reports an error if none has been recorded.
 *
 * @param resultCode result code delivered with the permission result.
 * @param data result [Intent] delivered with the permission result.
 */
fun onMediaProjectionPermissionGranted(resultCode: Int, data: Intent) {
    Log.d(TAG, " onMediaProjectionPermissionGranted: Storing result. Code=$resultCode ")

    // NOTE(review): presumably the permission callback in the hosting Activity goes on
    // to call reason() right after this, which is what actually starts the session —
    // confirm against the caller.
    lastMediaProjectionResultData = data
    lastMediaProjectionResultCode = resultCode
}
1158+
/**
 * Starts a Human Expert session and posts [taskText] as the first task.
 *
 * Flow, as implemented below:
 *  1. Bail out with an error state if no screen-capture permission result is stored.
 *  2. If a signaling client already exists, only post the task and return.
 *  3. Otherwise show a pending "connecting" chat message, create and initialize the
 *     [WebRTCSender], create the [SignalingClient], and post the task.
 *
 * Once the task is claimed, screen capture is started, a peer connection is created
 * and an SDP offer is sent through the signaling client.
 *
 * All UI-state updates made from listener callbacks are dispatched to
 * [Dispatchers.Main] via [viewModelScope].
 *
 * @param taskText the task description to post to the expert network.
 */
private fun startHumanExpertSession(taskText: String) {
    if (lastMediaProjectionResultData == null) {
        _uiState.value = PhotoReasoningUiState.Error(" Screen capture permission required.")
        return
    }

    if (signalingClient != null) {
        // Already connected: reuse the existing session.
        postTaskToHumanExpert(taskText)
        return
    }

    _uiState.value = PhotoReasoningUiState.Loading
    _chatState.addMessage(
        PhotoReasoningMessage(
            text = " Connecting to Human Expert network...",
            participant = PhotoParticipant.MODEL,
            isPending = true
        )
    )
    _chatMessagesFlow.value = _chatState.getAllMessages()

    // Initialize WebRTC Sender
    webRTCSender = WebRTCSender(getApplication(), object : WebRTCSender.WebRTCSenderListener {
        override fun onLocalICECandidate(candidate: IceCandidate) {
            // Relay locally gathered candidates to the remote peer through signaling.
            signalingClient?.sendICECandidate(candidate.sdp, candidate.sdpMid, candidate.sdpMLineIndex)
        }

        override fun onConnectionStateChanged(state: String) {
            Log.d(TAG, " WebRTC State: $state ")
            viewModelScope.launch(Dispatchers.Main) {
                when (state) {
                    " CONNECTED" -> {
                        _commandExecutionStatus.value = " Expert connected. Sharing screen."
                        replaceAiMessageText(
                            " Expert connected! They can now see your screen and control your device.",
                            isPending = false
                        )
                    }
                    " DISCONNECTED", " FAILED" -> {
                        // Only the status line changes; the chat message is kept as is.
                        _commandExecutionStatus.value = " Expert disconnected."
                    }
                }
            }
        }

        override fun onTapReceived(x: Float, y: Float) {
            dispatchTap(x, y)
        }

        override fun onError(message: String) {
            Log.e(TAG, " WebRTC Error: $message ")
            viewModelScope.launch(Dispatchers.Main) {
                _uiState.value = PhotoReasoningUiState.Error(" Video stream error: $message ")
            }
        }
    })
    webRTCSender?.initialize()

    // Initialize Signaling
    signalingClient = SignalingClient(object : SignalingClient.SignalingListener {
        override fun onTaskPosted(taskId: String) {
            viewModelScope.launch(Dispatchers.Main) {
                val msg = " Task posted. Waiting for an expert to claim it..."
                replaceAiMessageText(msg, isPending = true)
            }
        }

        override fun onTaskClaimed(taskId: String) {
            Log.d(TAG, " Task claimed! Starting WebRTC negotiation.")

            // Read the permission result at claim time so the most recently stored
            // Intent is used. The field is a nullable var and this callback runs
            // asynchronously, so handle null explicitly instead of crashing with `!!`.
            val projectionData = lastMediaProjectionResultData
            if (projectionData == null) {
                viewModelScope.launch(Dispatchers.Main) {
                    _uiState.value = PhotoReasoningUiState.Error(" Screen capture permission required.")
                }
                return
            }

            viewModelScope.launch(Dispatchers.Main) {
                replaceAiMessageText(" Expert found! Establishing video connection...", isPending = true)
            }

            // Start screen capture
            webRTCSender?.startScreenCapture(projectionData)
            webRTCSender?.createPeerConnection()

            // Create Offer
            webRTCSender?.createOffer { sdp ->
                signalingClient?.sendOffer(sdp)
            }
        }

        override fun onSDPAnswer(sdp: String) {
            webRTCSender?.setRemoteAnswer(sdp)
        }

        override fun onICECandidate(candidate: String, sdpMid: String?, sdpMLineIndex: Int) {
            webRTCSender?.addIceCandidate(candidate, sdpMid, sdpMLineIndex)
        }

        override fun onPeerDisconnected() {
            viewModelScope.launch(Dispatchers.Main) {
                _commandExecutionStatus.value = " Expert disconnected."
                replaceAiMessageText(" Expert disconnected.", isPending = false)
                // The sender is stopped but the signaling client is kept.
                // NOTE(review): a later claimed task will call startScreenCapture on
                // this stopped sender — confirm WebRTCSender supports restart after stop().
                webRTCSender?.stop()
            }
        }

        override fun onError(message: String) {
            viewModelScope.launch(Dispatchers.Main) {
                _uiState.value = PhotoReasoningUiState.Error(" Signaling error: $message ")
            }
        }
    })

    // Post the task immediately
    Log.d(TAG, " Signaling initialized. Posting task.")
    postTaskToHumanExpert(taskText)
}
1262+
/**
 * Posts [text] as a task through the current signaling client.
 *
 * Does nothing when no signaling client exists.
 *
 * @param text the task description to post.
 */
private fun postTaskToHumanExpert(text: String) {
    val client = signalingClient ?: return
    // No screenshot is attached: the expert is meant to see the live stream instead.
    client.postTask(text, hasScreenshot = false)
}
1266+
/**
 * Converts a tap received from the remote expert into absolute pixel coordinates and
 * forwards it to [ScreenOperatorAccessibilityService] as a [Command.TapCoordinates].
 *
 * The coordinates come from the remote peer and are therefore validated: non-finite
 * values are dropped, and the rest are clamped so the resulting pixel lies inside
 * `0 until width` / `0 until height`.
 *
 * @param x horizontal position; treated as normalized (0..1) per the original
 *   implementation's note that the web client sends normalized values.
 * @param y vertical position, normalized the same way.
 */
private fun dispatchTap(x: Float, y: Float) {
    Log.d(TAG, " Dispatching tap: ($x , $y )")

    // Reject NaN / infinite input; it cannot be mapped to a meaningful screen position.
    if (!x.isFinite() || !y.isFinite()) {
        Log.w(TAG, "Ignoring tap with non-finite coordinates")
        return
    }

    // NOTE(review): displayMetrics may not match the captured frame size (system bars,
    // rotation) — confirm against what WebRTCSender actually streams.
    val displayMetrics = getApplication<Application>().resources.displayMetrics
    val screenWidth = displayMetrics.widthPixels
    val screenHeight = displayMetrics.heightPixels

    // Clamp the normalized value first, then the pixel: 1.0 * width would otherwise
    // land one pixel past the last valid column/row.
    val absX = (x.coerceIn(0f, 1f) * screenWidth).toInt()
        .coerceIn(0, (screenWidth - 1).coerceAtLeast(0))
    val absY = (y.coerceIn(0f, 1f) * screenHeight).toInt()
        .coerceIn(0, (screenHeight - 1).coerceAtLeast(0))

    val command = Command.TapCoordinates(absX.toString(), absY.toString())
    ScreenOperatorAccessibilityService.executeCommand(command)

    viewModelScope.launch(Dispatchers.Main) {
        _commandExecutionStatus.value = " Expert tapped at ($absX , $absY )"
    }
}
1289+
1290+
1291+
11031292 private fun finalizeAiMessage (finalText : String ) {
11041293 Log .d(TAG , " finalizeAiMessage: Finalizing AI message." )
11051294 val messages = _chatState .getAllMessages().toMutableList()
0 commit comments