diff --git a/.gitignore b/.gitignore index c7fa15fe..dd62c88f 100644 --- a/.gitignore +++ b/.gitignore @@ -53,9 +53,6 @@ captures/ # External native build folder generated in Android Studio 2.2 and later .externalNativeBuild/ -# Google Services (e.g. APIs or Firebase) -google-services.json - # Freeline freeline.py freeline/ diff --git a/app/google-services.json b/app/google-services.json new file mode 100644 index 00000000..a7f2fc52 --- /dev/null +++ b/app/google-services.json @@ -0,0 +1,29 @@ +{ + "project_info": { + "project_number": "123456789012", + "project_id": "screenoperator", + "storage_bucket": "screenoperator.appspot.com" + }, + "client": [ + { + "client_info": { + "mobilesdk_app_id": "1:123456789012:android:abcdef1234567890", + "android_client_info": { + "package_name": "io.github.android_poweruser" + } + }, + "oauth_client": [], + "api_key": [ + { + "current_key": "AIzaSyDUMMY_PLACEHOLDER_KEY" + } + ], + "services": { + "appinvite_service": { + "other_platform_oauth_client": [] + } + } + } + ], + "configuration_version": "1" +} diff --git a/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt b/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt index 7f0d0e8a..48f03ecd 100644 --- a/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt +++ b/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt @@ -389,6 +389,10 @@ class ScreenOperatorAccessibilityService : AccessibilityService() { openApp(command.packageName) } } + is Command.Retrieve -> { + Log.d(TAG, "Retrieve command is handled in prompt construction: ${command.heading}") + false + } is Command.WriteText -> { executeSyncCommandAction( logMessage = "Writing text: ${command.text}", diff --git a/app/src/main/kotlin/com/google/ai/sample/TrialTimerService.kt b/app/src/main/kotlin/com/google/ai/sample/TrialTimerService.kt index c7ff256c..94285104 100644 --- a/app/src/main/kotlin/com/google/ai/sample/TrialTimerService.kt +++ b/app/src/main/kotlin/com/google/ai/sample/TrialTimerService.kt @@ -228,4 +228,3 @@ class TrialTimerService : Service() { stopTimerLogic() } } - diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningDatabasePopup.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningDatabasePopup.kt index a67bb7d6..710cf1c6 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningDatabasePopup.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningDatabasePopup.kt @@ -379,7 +379,7 @@ fun DatabaseListPopup( verticalAlignment = Alignment.CenterVertically, horizontalArrangement = Arrangement.End ) { - Text("This is also sent to the AI", color = Color.Black.copy(alpha = 0.6f), style = MaterialTheme.typography.bodyMedium, modifier = Modifier.weight(1f)) + Text("The headings are sent to the AI and the content is included on request", color = Color.Black.copy(alpha = 0.6f), style = MaterialTheme.typography.bodyMedium, modifier = Modifier.weight(1f)) Button(onClick = onNewClicked, colors = ButtonDefaults.buttonColors(containerColor = MaterialTheme.colorScheme.primary), modifier = Modifier.padding(start = 8.dp)) { Text("New") } @@ -419,7 +419,7 @@ fun DatabaseListPopup( } else { val entriesToExport = entries.filter { selectedEntryTitles.contains(it.title) } val jsonString = Json.encodeToString(ListSerializer(SystemMessageEntry.serializer()), entriesToExport) - shareTextFile(context, "system_messages_export.txt", jsonString) + shareTextFile(context, "Database.txt", jsonString) } selectionModeActive = false selectedEntryTitles = emptySet() diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningRoute.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningRoute.kt index 80344aff..2c30defb 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningRoute.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningRoute.kt @@ -165,20 +165,13 @@ private suspend fun uriToBitmap( } } -private fun extractVideoFrame(context: android.content.Context, uri: Uri): Bitmap? { - val retriever = MediaMetadataRetriever() - return try { - retriever.setDataSource(context, uri) - retriever.getFrameAtTime(0) - } catch (e: IllegalArgumentException) { - android.util.Log.e("PhotoReasoningRoute", "Invalid video URI: $uri", e) - null private fun extractVideoFrame(context: android.content.Context, uri: Uri): Bitmap? { val retriever = MediaMetadataRetriever() return try { retriever.setDataSource(context, uri) retriever.getFrameAtTime(0, MediaMetadataRetriever.OPTION_CLOSEST_SYNC) } catch (e: Exception) { + android.util.Log.e("PhotoReasoningRoute", "Error extracting video frame for URI: $uri", e) null } finally { retriever.release() diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt index c386835a..5cc1d2ce 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt @@ -349,6 +349,7 @@ fun PhotoReasoningScreen( is Command.ClickButton -> "Click on button: \"${command.buttonText}\"" is Command.TapCoordinates -> "Tap coordinates: (${command.x}, ${command.y})" is Command.TakeScreenshot -> "Take screenshot" + is Command.Retrieve -> "Retrieve: \"${command.heading}\"" else -> command::class.simpleName ?: "Unknown Command" } Text("${index + 1}. $commandText", color = MaterialTheme.colorScheme.onTertiaryContainer) diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningTextPolicies.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningTextPolicies.kt index 30fc466d..0ea216be 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningTextPolicies.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningTextPolicies.kt @@ -4,6 +4,14 @@ import android.content.Context import com.google.ai.sample.util.SystemMessageEntryPreferences internal object PhotoReasoningTextPolicies { + private const val RETRIEVAL_HEADER_PREFIX = "Retrieved information [" + + data class RetrievalResult( + val heading: String, + val content: String, + val available: Boolean + ) + fun buildPromptWithScreenInfo(userInput: String, screenInfoForPrompt: String?): String { return if (screenInfoForPrompt != null && screenInfoForPrompt.isNotBlank()) { "$userInput\n\n$screenInfoForPrompt" @@ -39,12 +47,45 @@ internal object PhotoReasoningTextPolicies { return "" } val builder = StringBuilder() - builder.append("Available System Guides:\n---\n") - for (entry in entries) { - builder.append("Title: ${entry.title}\n") - builder.append("Guide: ${entry.guide}\n") - builder.append("---\n") + builder.append("Retrievable information: ") + entries.forEach { entry -> + builder.append(entry.title).append(",\n") } return builder.toString() } + + fun resolveRetrievalRequest(context: Context, heading: String): RetrievalResult { + val normalizedHeading = heading.trim() + val entry = SystemMessageEntryPreferences.loadEntries(context).firstOrNull { + it.title.equals(normalizedHeading, ignoreCase = true) + } + return if (entry != null) { + RetrievalResult( + heading = entry.title, + content = entry.guide, + available = true + ) + } else { + RetrievalResult( + heading = normalizedHeading, + content = "The information is not available", + available = false + ) + } + } + + fun formatRetrievalResultForPrompt(result: RetrievalResult): String { + return if (result.available) { + "$RETRIEVAL_HEADER_PREFIX${result.heading}]:\n${result.content}" + } else { + "$RETRIEVAL_HEADER_PREFIX${result.heading}]:\nThe information is not available" + } + } + + fun isHeadingAlreadyRetrievedInChat(messages: List, heading: String): Boolean { + val marker = "$RETRIEVAL_HEADER_PREFIX$heading]" + return messages.any { message -> + message.text.contains(marker, ignoreCase = true) + } + } } diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt index 4a33a5a7..c6132cd5 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt @@ -202,6 +202,7 @@ class PhotoReasoningViewModel( private var currentRetryAttempt = 0 private var currentScreenInfoForPrompt: String? = null private var currentImageUrisForChat: List? = null + private var pendingRetrievedInfoForNextScreenshot: String? = null private val sseJson = PhotoReasoningSerialization.createStreamingJsonParser() private val openAiStreamParser = PhotoReasoningOpenAiStreamParser(sseJson) @@ -2264,6 +2265,11 @@ private fun processCommands(text: String) { val commandBatch = PhotoReasoningCommandProcessing.parseForFinalExecution(text) val commands = commandBatch.commands val hasTakeScreenshotCommand = commandBatch.hasTakeScreenshotCommand + val commandsToExecute = commands.filterNot { it is Command.Retrieve } + + if (hasTakeScreenshotCommand) { + pendingRetrievedInfoForNextScreenshot = buildRetrievedInfoForNextScreenshot(commands) + } if (commands.isNotEmpty()) { if (PhotoReasoningCommandExecutionGuard.shouldAbort(commandProcessingJob?.isActive == true, stopExecutionFlag.get())) return@launch @@ -2281,7 +2287,7 @@ private fun processCommands(text: String) { ) // Execute the commands - for (command in commands) { + for (command in commandsToExecute) { if (PhotoReasoningCommandExecutionGuard.shouldAbort(commandProcessingJob?.isActive == true, stopExecutionFlag.get())) { // Check for cancellation before executing each command Log.d(TAG, "Command execution stopped before executing: $command") _commandExecutionStatus.value = "Command execution stopped." @@ -2327,6 +2333,60 @@ private fun processCommands(text: String) { } } } + + private data class RetrievalCandidate( + val heading: String, + val includeUnavailableMessage: Boolean + ) + + private fun buildRetrievedInfoForNextScreenshot(commands: List): String? { + val requestedCandidates = mutableListOf() + commands.forEach { command -> + when (command) { + is Command.Retrieve -> requestedCandidates.add( + RetrievalCandidate( + heading = command.heading.trim(), + includeUnavailableMessage = true + ) + ) + is Command.OpenApp -> requestedCandidates.add( + RetrievalCandidate( + heading = command.packageName.trim(), + includeUnavailableMessage = false + ) + ) + else -> Unit + } + } + + if (requestedCandidates.isEmpty()) { + return null + } + + val parts = mutableListOf() + val usedHeadingsInThisBatch = mutableSetOf() + + requestedCandidates.forEach { candidate -> + if (candidate.heading.isBlank()) return@forEach + val resolved = PhotoReasoningTextPolicies.resolveRetrievalRequest(appContext, candidate.heading) + val duplicateInBatch = !usedHeadingsInThisBatch.add(resolved.heading.lowercase()) + val alreadyInChat = PhotoReasoningTextPolicies.isHeadingAlreadyRetrievedInChat( + messages = _chatState.getAllMessages(), + heading = resolved.heading + ) + if (!duplicateInBatch && !alreadyInChat) { + if (resolved.available || candidate.includeUnavailableMessage) { + parts.add(PhotoReasoningTextPolicies.formatRetrievalResultForPrompt(resolved)) + } + } + } + + if (parts.isEmpty()) { + return null + } + + return parts.joinToString(separator = "\n\n") + } private fun executeAccessibilityCommand(command: Command, shouldTrackCommand: Boolean) { ScreenOperatorAccessibilityService.executeCommand(command) if (shouldTrackCommand) { @@ -2483,6 +2543,8 @@ private fun processCommands(text: String) { context: Context, screenInfo: String? = null ) { + val enrichedScreenInfo = buildEnrichedScreenInfo(screenInfo) + if (screenshotUri == Uri.EMPTY) { // This case is for offline models, where we don't have a screenshot. // We just want to send the screen info. @@ -2490,7 +2552,7 @@ private fun processCommands(text: String) { reason( userInput = genericAnalysisPrompt, selectedImages = emptyList(), - screenInfoForPrompt = screenInfo, + screenInfoForPrompt = enrichedScreenInfo, imageUrisForChat = emptyList() ) return @@ -2540,7 +2602,7 @@ private fun processCommands(text: String) { reason( userInput = createGenericScreenshotPrompt(), selectedImages = listOf(bitmap), - screenInfoForPrompt = screenInfo, + screenInfoForPrompt = enrichedScreenInfo, imageUrisForChat = listOf(screenshotUri.toString()) ) } @@ -2567,6 +2629,18 @@ private fun processCommands(text: String) { } } + private fun buildEnrichedScreenInfo(screenInfo: String?): String? { + val retrievedInfo = pendingRetrievedInfoForNextScreenshot + pendingRetrievedInfoForNextScreenshot = null + + return when { + !retrievedInfo.isNullOrBlank() && !screenInfo.isNullOrBlank() -> "$retrievedInfo\n\n$screenInfo" + !retrievedInfo.isNullOrBlank() -> retrievedInfo + !screenInfo.isNullOrBlank() -> screenInfo + else -> null + } + } + private fun enqueueMistralAutoScreenshotRequest( bitmap: Bitmap, screenshotUri: String, diff --git a/app/src/main/kotlin/com/google/ai/sample/util/Command.kt b/app/src/main/kotlin/com/google/ai/sample/util/Command.kt index 049d4bf3..d8f56ed1 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/Command.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/Command.kt @@ -21,6 +21,7 @@ sealed class Command { data class ScrollLeftFromCoordinates(val x: String, val y: String, val distance: String, val duration: Long) : Command() data class ScrollRightFromCoordinates(val x: String, val y: String, val distance: String, val duration: Long) : Command() data class OpenApp(val packageName: String) : Command() + data class Retrieve(val heading: String) : Command() data class WriteText(val text: String) : Command() object UseHighReasoningModel : Command() object UseLowReasoningModel : Command() diff --git a/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt b/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt index 85978294..b86c6222 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/CommandParser.kt @@ -16,7 +16,7 @@ object CommandParser { SCROLL_DOWN_FROM_COORDINATES, SCROLL_UP_FROM_COORDINATES, SCROLL_LEFT_FROM_COORDINATES, SCROLL_RIGHT_FROM_COORDINATES, OPEN_APP, WRITE_TEXT, USE_HIGH_REASONING_MODEL, USE_LOW_REASONING_MODEL, - PRESS_ENTER_KEY + PRESS_ENTER_KEY, RETRIEVE } // Data class to hold pattern information @@ -81,7 +81,10 @@ object CommandParser { { match -> Command.ScrollRightFromCoordinates(match.groupValues[1], match.groupValues[2], match.groupValues[3], match.groupValues[4].toLong()) }, CommandTypeEnum.SCROLL_RIGHT_FROM_COORDINATES), // Open app patterns - PatternInfo("openApp1", Regex("(?i)\\bopenApp\\([\"']([^\"']+)[\"']\\)"), { match -> Command.OpenApp(match.groupValues[1]) }, CommandTypeEnum.OPEN_APP) + PatternInfo("openApp1", Regex("(?i)\\bopenApp\\([\"']([^\"']+)[\"']\\)"), { match -> Command.OpenApp(match.groupValues[1]) }, CommandTypeEnum.OPEN_APP), + + // Retrieve information patterns + PatternInfo("retrieve1", Regex("(?i)\\bretrieve\\([\"']([^\"']+)[\"']\\)"), { match -> Command.Retrieve(match.groupValues[1]) }, CommandTypeEnum.RETRIEVE) ) // Buffer for storing partial text between calls @@ -160,6 +163,7 @@ object CommandParser { is Command.ScrollLeftFromCoordinates -> Log.d(TAG, "Command details: ScrollLeftFromCoordinates(${command.x}, ${command.y}, ${command.distance}, ${command.duration})") is Command.ScrollRightFromCoordinates -> Log.d(TAG, "Command details: ScrollRightFromCoordinates(${command.x}, ${command.y}, ${command.distance}, ${command.duration})") is Command.OpenApp -> Log.d(TAG, "Command details: OpenApp(\"${command.packageName}\")") + is Command.Retrieve -> Log.d(TAG, "Command details: Retrieve(\"${command.heading}\")") is Command.WriteText -> Log.d(TAG, "Command details: WriteText(\"${command.text}\")") is Command.PressEnterKey -> Log.d(TAG, "Command details: PressEnterKey") } diff --git a/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt b/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt index d22db990..bc36874d 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt @@ -14,7 +14,7 @@ object SystemMessagePreferences { private const val KEY_FIRST_START_COMPLETED = "first_start_completed" // New flag // Content from pasted_content.txt - private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()" Your task is:""" + private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. Retrieve information using "retrieve("sample")". You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()" Your task is:""" private fun prefs(context: Context) = context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE) /** diff --git a/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt b/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt index e019053a..f51db6b7 100644 --- a/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt +++ b/app/src/test/java/com/google/ai/sample/util/CommandParserTest.kt @@ -60,4 +60,11 @@ class CommandParserTest { assertEquals(1, commands.size) assertTrue(commands.first() is Command.PressEnterKey) } + + @Test + fun parseCommands_extractsRetrieveCommand() { + val commands = CommandParser.parseCommands("retrieve(\"Termux\")", clearBuffer = true) + assertEquals(1, commands.size) + assertTrue(commands.first() is Command.Retrieve) + } }