Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,6 @@ captures/
# External native build folder generated in Android Studio 2.2 and later
.externalNativeBuild/

# Google Services (e.g. APIs or Firebase)
google-services.json

# Freeline
freeline.py
freeline/
Expand Down
29 changes: 29 additions & 0 deletions app/google-services.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"project_info": {
"project_number": "123456789012",
"project_id": "screenoperator",
"storage_bucket": "screenoperator.appspot.com"
},
"client": [
{
"client_info": {
"mobilesdk_app_id": "1:123456789012:android:abcdef1234567890",
"android_client_info": {
"package_name": "io.github.android_poweruser"
}
},
"oauth_client": [],
"api_key": [
{
"current_key": "AIzaSyDUMMY_PLACEHOLDER_KEY"
}
],
"services": {
"appinvite_service": {
"other_platform_oauth_client": []
}
}
}
],
"configuration_version": "1"
}
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,10 @@ class ScreenOperatorAccessibilityService : AccessibilityService() {
openApp(command.packageName)
}
}
is Command.Retrieve -> {
Log.d(TAG, "Retrieve command is handled in prompt construction: ${command.heading}")
false
}
is Command.WriteText -> {
executeSyncCommandAction(
logMessage = "Writing text: ${command.text}",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -228,4 +228,3 @@ class TrialTimerService : Service() {
stopTimerLogic()
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ fun DatabaseListPopup(
verticalAlignment = Alignment.CenterVertically,
horizontalArrangement = Arrangement.End
) {
Text("This is also sent to the AI", color = Color.Black.copy(alpha = 0.6f), style = MaterialTheme.typography.bodyMedium, modifier = Modifier.weight(1f))
Text("The headings are sent to the AI and the content is included on request", color = Color.Black.copy(alpha = 0.6f), style = MaterialTheme.typography.bodyMedium, modifier = Modifier.weight(1f))
Button(onClick = onNewClicked, colors = ButtonDefaults.buttonColors(containerColor = MaterialTheme.colorScheme.primary), modifier = Modifier.padding(start = 8.dp)) {
Text("New")
}
Expand Down Expand Up @@ -419,7 +419,7 @@ fun DatabaseListPopup(
} else {
val entriesToExport = entries.filter { selectedEntryTitles.contains(it.title) }
val jsonString = Json.encodeToString(ListSerializer(SystemMessageEntry.serializer()), entriesToExport)
shareTextFile(context, "system_messages_export.txt", jsonString)
shareTextFile(context, "Database.txt", jsonString)
}
selectionModeActive = false
selectedEntryTitles = emptySet()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,20 +165,13 @@ private suspend fun uriToBitmap(
}
}

private fun extractVideoFrame(context: android.content.Context, uri: Uri): Bitmap? {
val retriever = MediaMetadataRetriever()
return try {
retriever.setDataSource(context, uri)
retriever.getFrameAtTime(0)
} catch (e: IllegalArgumentException) {
android.util.Log.e("PhotoReasoningRoute", "Invalid video URI: $uri", e)
null
private fun extractVideoFrame(context: android.content.Context, uri: Uri): Bitmap? {
val retriever = MediaMetadataRetriever()
return try {
retriever.setDataSource(context, uri)
retriever.getFrameAtTime(0, MediaMetadataRetriever.OPTION_CLOSEST_SYNC)
} catch (e: Exception) {
android.util.Log.e("PhotoReasoningRoute", "Error extracting video frame for URI: $uri", e)
null
} finally {
retriever.release()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,7 @@ fun PhotoReasoningScreen(
is Command.ClickButton -> "Click on button: \"${command.buttonText}\""
is Command.TapCoordinates -> "Tap coordinates: (${command.x}, ${command.y})"
is Command.TakeScreenshot -> "Take screenshot"
is Command.Retrieve -> "Retrieve: \"${command.heading}\""
else -> command::class.simpleName ?: "Unknown Command"
}
Text("${index + 1}. $commandText", color = MaterialTheme.colorScheme.onTertiaryContainer)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ import android.content.Context
import com.google.ai.sample.util.SystemMessageEntryPreferences

internal object PhotoReasoningTextPolicies {
// Opening part of the header line placed in front of retrieved information in a prompt.
// formatRetrievalResultForPrompt appends "<heading>]:" to it, and
// isHeadingAlreadyRetrievedInChat searches chat messages for the same "prefix + heading + ]"
// marker, so both must keep using this exact text.
private const val RETRIEVAL_HEADER_PREFIX = "Retrieved information ["

/**
 * Outcome of looking up one retrievable heading.
 *
 * @property heading the heading the result is reported under.
 * @property content the text to hand back for that heading.
 * @property available `true` when a matching entry was found, `false` otherwise.
 */
data class RetrievalResult(
    val heading: String,
    val content: String,
    val available: Boolean,
)

fun buildPromptWithScreenInfo(userInput: String, screenInfoForPrompt: String?): String {
return if (screenInfoForPrompt != null && screenInfoForPrompt.isNotBlank()) {
"$userInput\n\n$screenInfoForPrompt"
Expand Down Expand Up @@ -39,12 +47,45 @@ internal object PhotoReasoningTextPolicies {
return ""
}
val builder = StringBuilder()
builder.append("Available System Guides:\n---\n")
for (entry in entries) {
builder.append("Title: ${entry.title}\n")
builder.append("Guide: ${entry.guide}\n")
builder.append("---\n")
builder.append("Retrievable information: ")
entries.forEach { entry ->
builder.append(entry.title).append(",\n")
}
return builder.toString()
}

/**
 * Resolves a retrieval request against the entries returned by
 * [SystemMessageEntryPreferences.loadEntries].
 *
 * The requested [heading] is trimmed and compared with each entry title, ignoring case.
 *
 * @param context passed through to [SystemMessageEntryPreferences.loadEntries].
 * @param heading the heading that was requested.
 * @return a result carrying the matched entry's own title and guide with `available = true`,
 * or the trimmed heading with a fixed "not available" text and `available = false`.
 */
fun resolveRetrievalRequest(context: Context, heading: String): RetrievalResult {
    val requestedTitle = heading.trim()
    val match = SystemMessageEntryPreferences.loadEntries(context)
        .firstOrNull { candidate -> candidate.title.equals(requestedTitle, ignoreCase = true) }
        // No entry carries that title: report the miss under the heading that was asked for.
        ?: return RetrievalResult(
            heading = requestedTitle,
            content = "The information is not available",
            available = false
        )

    return RetrievalResult(heading = match.title, content = match.guide, available = true)
}

/**
 * Renders a [RetrievalResult] as a prompt section.
 *
 * The first line is the retrieval header prefix followed by the heading and "]:".
 * The second part is the result content when the result is available, otherwise the
 * fixed text "The information is not available" (whatever the result's content holds).
 *
 * @param result the lookup outcome to render.
 * @return the header line and body separated by a newline.
 */
fun formatRetrievalResultForPrompt(result: RetrievalResult): String {
    val body = if (result.available) result.content else "The information is not available"
    return RETRIEVAL_HEADER_PREFIX + result.heading + "]:\n" + body
}

/**
 * Tells whether any chat message already contains the retrieval header for [heading].
 *
 * The marker searched for is the retrieval header prefix, the heading, and a closing "]";
 * the search ignores case.
 *
 * @param messages the chat messages to scan.
 * @param heading the heading whose header marker is looked for.
 * @return `true` as soon as one message text contains the marker, `false` if none does.
 */
fun isHeadingAlreadyRetrievedInChat(messages: List<PhotoReasoningMessage>, heading: String): Boolean {
    val marker = "$RETRIEVAL_HEADER_PREFIX$heading]"
    for (chatMessage in messages) {
        if (chatMessage.text.contains(marker, ignoreCase = true)) {
            return true
        }
    }
    return false
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,7 @@ class PhotoReasoningViewModel(
private var currentRetryAttempt = 0
private var currentScreenInfoForPrompt: String? = null
private var currentImageUrisForChat: List<String>? = null
private var pendingRetrievedInfoForNextScreenshot: String? = null

private val sseJson = PhotoReasoningSerialization.createStreamingJsonParser()
private val openAiStreamParser = PhotoReasoningOpenAiStreamParser(sseJson)
Expand Down Expand Up @@ -2264,6 +2265,11 @@ private fun processCommands(text: String) {
val commandBatch = PhotoReasoningCommandProcessing.parseForFinalExecution(text)
val commands = commandBatch.commands
val hasTakeScreenshotCommand = commandBatch.hasTakeScreenshotCommand
val commandsToExecute = commands.filterNot { it is Command.Retrieve }

if (hasTakeScreenshotCommand) {
pendingRetrievedInfoForNextScreenshot = buildRetrievedInfoForNextScreenshot(commands)
}

if (commands.isNotEmpty()) {
if (PhotoReasoningCommandExecutionGuard.shouldAbort(commandProcessingJob?.isActive == true, stopExecutionFlag.get())) return@launch
Expand All @@ -2281,7 +2287,7 @@ private fun processCommands(text: String) {
)

// Execute the commands
for (command in commands) {
for (command in commandsToExecute) {
if (PhotoReasoningCommandExecutionGuard.shouldAbort(commandProcessingJob?.isActive == true, stopExecutionFlag.get())) { // Check for cancellation before executing each command
Log.d(TAG, "Command execution stopped before executing: $command")
_commandExecutionStatus.value = "Command execution stopped."
Expand Down Expand Up @@ -2327,6 +2333,60 @@ private fun processCommands(text: String) {
}
}
}

/**
 * One heading that should be looked up for the next screenshot prompt.
 *
 * @property heading the heading to look up.
 * @property includeUnavailableMessage whether a failed lookup should still produce a
 * "not available" section in the prompt.
 */
private data class RetrievalCandidate(
    val heading: String,
    val includeUnavailableMessage: Boolean,
)

/**
 * Builds the retrieved-information text to attach to the next screenshot prompt.
 *
 * Candidates are collected from [commands] in order:
 * - [Command.Retrieve]: its heading is looked up, and a failed lookup is reported
 *   back as a "not available" section.
 * - [Command.OpenApp]: its package name is looked up as a heading, and a failed
 *   lookup is silently skipped.
 *
 * Blank headings are ignored. A heading is emitted at most once per batch (compared
 * case-insensitively on the resolved heading) and is skipped when the chat already
 * contains its retrieval header.
 *
 * A heading counts as used for this batch only once a section has actually been emitted
 * for it. Otherwise a silent candidate that fails to resolve (an `openApp` whose package
 * name matches no entry) would hide the "not available" answer owed to an explicit
 * `retrieve` for the same heading later in the same batch.
 *
 * @param commands the parsed commands of the current model response.
 * @return the emitted sections joined by a blank line, or `null` when nothing is emitted.
 */
private fun buildRetrievedInfoForNextScreenshot(commands: List<Command>): String? {
    val requestedCandidates = commands.mapNotNull { command ->
        when (command) {
            is Command.Retrieve -> RetrievalCandidate(
                heading = command.heading.trim(),
                includeUnavailableMessage = true
            )
            is Command.OpenApp -> RetrievalCandidate(
                heading = command.packageName.trim(),
                includeUnavailableMessage = false
            )
            else -> null
        }
    }.filter { it.heading.isNotBlank() }

    if (requestedCandidates.isEmpty()) {
        return null
    }

    // The chat history does not change while this batch is assembled, so fetch it once.
    val chatMessages = _chatState.getAllMessages()
    val emittedHeadings = mutableSetOf<String>()
    val parts = mutableListOf<String>()

    for (candidate in requestedCandidates) {
        val resolved = PhotoReasoningTextPolicies.resolveRetrievalRequest(appContext, candidate.heading)

        // A silent miss produces no output and must not reserve the heading.
        if (!resolved.available && !candidate.includeUnavailableMessage) continue

        val headingKey = resolved.heading.lowercase()
        if (headingKey in emittedHeadings) continue

        val alreadyInChat = PhotoReasoningTextPolicies.isHeadingAlreadyRetrievedInChat(
            messages = chatMessages,
            heading = resolved.heading
        )
        if (alreadyInChat) continue

        emittedHeadings.add(headingKey)
        parts.add(PhotoReasoningTextPolicies.formatRetrievalResultForPrompt(resolved))
    }

    return parts.takeIf { it.isNotEmpty() }?.joinToString(separator = "\n\n")
}
private fun executeAccessibilityCommand(command: Command, shouldTrackCommand: Boolean) {
ScreenOperatorAccessibilityService.executeCommand(command)
if (shouldTrackCommand) {
Expand Down Expand Up @@ -2483,14 +2543,16 @@ private fun processCommands(text: String) {
context: Context,
screenInfo: String? = null
) {
val enrichedScreenInfo = buildEnrichedScreenInfo(screenInfo)

if (screenshotUri == Uri.EMPTY) {
// This case is for offline models, where we don't have a screenshot.
// We just want to send the screen info.
val genericAnalysisPrompt = createGenericScreenshotPrompt()
reason(
userInput = genericAnalysisPrompt,
selectedImages = emptyList(),
screenInfoForPrompt = screenInfo,
screenInfoForPrompt = enrichedScreenInfo,
imageUrisForChat = emptyList()
)
return
Expand Down Expand Up @@ -2540,7 +2602,7 @@ private fun processCommands(text: String) {
reason(
userInput = createGenericScreenshotPrompt(),
selectedImages = listOf(bitmap),
screenInfoForPrompt = screenInfo,
screenInfoForPrompt = enrichedScreenInfo,
imageUrisForChat = listOf(screenshotUri.toString())
)
}
Expand All @@ -2567,6 +2629,18 @@ private fun processCommands(text: String) {
}
}

/**
 * Combines the pending retrieved information with [screenInfo] for the next prompt.
 *
 * The pending retrieved information is read and then cleared on every call, so it is
 * handed out at most once.
 *
 * @param screenInfo the screen description for the prompt, if any.
 * @return the retrieved information followed by a blank line and the screen info when both
 * are non-blank, whichever one is non-blank otherwise, or `null` when neither is.
 */
private fun buildEnrichedScreenInfo(screenInfo: String?): String? {
    // Take the pending text and reset the slot before deciding what to return.
    val pending = pendingRetrievedInfoForNextScreenshot
    pendingRetrievedInfoForNextScreenshot = null

    val hasPending = !pending.isNullOrBlank()
    val hasScreenInfo = !screenInfo.isNullOrBlank()

    if (hasPending && hasScreenInfo) {
        return "$pending\n\n$screenInfo"
    }
    if (hasPending) {
        return pending
    }
    return if (hasScreenInfo) screenInfo else null
}

private fun enqueueMistralAutoScreenshotRequest(
bitmap: Bitmap,
screenshotUri: String,
Expand Down
1 change: 1 addition & 0 deletions app/src/main/kotlin/com/google/ai/sample/util/Command.kt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ sealed class Command {
data class ScrollLeftFromCoordinates(val x: String, val y: String, val distance: String, val duration: Long) : Command()
data class ScrollRightFromCoordinates(val x: String, val y: String, val distance: String, val duration: Long) : Command()
data class OpenApp(val packageName: String) : Command()
data class Retrieve(val heading: String) : Command()
data class WriteText(val text: String) : Command()
object UseHighReasoningModel : Command()
object UseLowReasoningModel : Command()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ object CommandParser {
SCROLL_DOWN_FROM_COORDINATES, SCROLL_UP_FROM_COORDINATES,
SCROLL_LEFT_FROM_COORDINATES, SCROLL_RIGHT_FROM_COORDINATES,
OPEN_APP, WRITE_TEXT, USE_HIGH_REASONING_MODEL, USE_LOW_REASONING_MODEL,
PRESS_ENTER_KEY
PRESS_ENTER_KEY, RETRIEVE
}

// Data class to hold pattern information
Expand Down Expand Up @@ -81,7 +81,10 @@ object CommandParser {
{ match -> Command.ScrollRightFromCoordinates(match.groupValues[1], match.groupValues[2], match.groupValues[3], match.groupValues[4].toLong()) }, CommandTypeEnum.SCROLL_RIGHT_FROM_COORDINATES),

// Open app patterns
PatternInfo("openApp1", Regex("(?i)\\bopenApp\\([\"']([^\"']+)[\"']\\)"), { match -> Command.OpenApp(match.groupValues[1]) }, CommandTypeEnum.OPEN_APP)
PatternInfo("openApp1", Regex("(?i)\\bopenApp\\([\"']([^\"']+)[\"']\\)"), { match -> Command.OpenApp(match.groupValues[1]) }, CommandTypeEnum.OPEN_APP),

// Retrieve information patterns
PatternInfo("retrieve1", Regex("(?i)\\bretrieve\\([\"']([^\"']+)[\"']\\)"), { match -> Command.Retrieve(match.groupValues[1]) }, CommandTypeEnum.RETRIEVE)
)

// Buffer for storing partial text between calls
Expand Down Expand Up @@ -160,6 +163,7 @@ object CommandParser {
is Command.ScrollLeftFromCoordinates -> Log.d(TAG, "Command details: ScrollLeftFromCoordinates(${command.x}, ${command.y}, ${command.distance}, ${command.duration})")
is Command.ScrollRightFromCoordinates -> Log.d(TAG, "Command details: ScrollRightFromCoordinates(${command.x}, ${command.y}, ${command.distance}, ${command.duration})")
is Command.OpenApp -> Log.d(TAG, "Command details: OpenApp(\"${command.packageName}\")")
is Command.Retrieve -> Log.d(TAG, "Command details: Retrieve(\"${command.heading}\")")
is Command.WriteText -> Log.d(TAG, "Command details: WriteText(\"${command.text}\")")
is Command.PressEnterKey -> Log.d(TAG, "Command details: PressEnterKey")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ object SystemMessagePreferences {
private const val KEY_FIRST_START_COMPLETED = "first_start_completed" // New flag

// Content from pasted_content.txt
private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()" Your task is:"""
private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. Retrieve information using "retrieve("sample")". You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()" Your task is:"""
/**
 * Returns the shared preferences named by PREFS_NAME, opened from [context] in
 * [Context.MODE_PRIVATE].
 */
private fun prefs(context: Context) = context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE)

/**
Expand Down
Loading
Loading