Skip to content

Commit 6644379

Browse files
Cumulative fix for screenshot loop, AI interaction, and UX refinements.
This commit incorporates a series of changes to address issues related to screenshot functionality:

1. **Resolved Screenshot Loop & Stability:**
   - Implemented a flag-controlled `ImageReader.OnImageAvailableListener` in `ScreenCaptureService.kt`. `takeScreenshot()` now sets an `AtomicBoolean` flag, and the listener consumes this flag to process and save an image only once per explicit request. Unexpected frames are discarded. This is the primary fix for the ~1-second screenshot loop, `ImageReader_JNI` errors, and multiple broadcasts for a single capture event.
   - Added debouncing logic to `PhotoReasoningViewModel.addScreenshotToConversation()` to prevent rapid/duplicate processing of the same screenshot URI, further stabilizing the system and preventing AI flooding/chat spam.
2. **Ensured AI Responds to Image Inputs:**
   - The call to `reason()` within `addScreenshotToConversation()` is active (after debouncer check).
   - `PhotoReasoningViewModel.reason()` was enhanced to use `currentSelectedImages` (the last programmatically taken screenshot) if you send a text prompt without explicitly attaching new images, allowing you to follow up on AI-initiated screenshots. `currentSelectedImages` is cleared after use in this context.
3. **Restored Startup MediaProjection Permission Request:**
   - The call to `requestMediaProjectionPermission()` in `MainActivity.onCreate()` is active.
   - An `isProcessingExplicitScreenshotRequest` flag in `MainActivity` ensures this startup permission grant doesn't automatically start `ScreenCaptureService`, while explicit requests (from AccessibilityService) do.
4. **Minimized Screenshot Command Delay:**
   - Reduced the command processing delay for `Command.TakeScreenshot` in `ScreenOperatorAccessibilityService.scheduleNextCommandProcessing()` from 850ms to 50ms for better responsiveness.
5. **Consolidated Screenshot Information for AI:**
   - `PhotoReasoningViewModel.reason()` now accepts an optional `screenInfoForPrompt` parameter.
   - `addScreenshotToConversation()` calls `reason()` with a generic prompt (e.g., "Analyze the provided screenshot and its context.") and passes the actual `screenInfo` (from Accessibility Service) to this new parameter.
   - `reason()` combines the generic prompt and `screenInfoForPrompt` into a single text block used for both the AI query and your message in the chat history, avoiding redundancy.
6. **Reverted `CommandParser.kt` Changes:**
   - Restored broader, natural-language regex for `TAKE_SCREENSHOT` commands as per your request.

These changes collectively aim to provide a stable, responsive, and logically sound screenshotting and AI interaction experience.
1 parent 5794d8c commit 6644379

4 files changed

Lines changed: 94 additions & 70 deletions

File tree

app/src/main/kotlin/com/google/ai/sample/MainActivity.kt

Lines changed: 19 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -121,6 +121,7 @@ class MainActivity : ComponentActivity() {
121121
private var currentScreenInfoForScreenshot: String? = null
122122

123123
private lateinit var navController: NavHostController
124+
private var isProcessingExplicitScreenshotRequest: Boolean = false
124125

125126
private val screenshotRequestHandler = object : BroadcastReceiver() {
126127
override fun onReceive(context: Context?, intent: Intent?) {
@@ -134,6 +135,7 @@ class MainActivity : ComponentActivity() {
134135
this@MainActivity.takeAdditionalScreenshot()
135136
} else {
136137
Log.d(TAG, "ScreenCaptureService not running. Calling requestMediaProjectionPermission() to start it.")
138+
this@MainActivity.isProcessingExplicitScreenshotRequest = true
137139
this@MainActivity.requestMediaProjectionPermission()
138140
}
139141
}
@@ -460,20 +462,27 @@ class MainActivity : ComponentActivity() {
460462
) { result ->
461463
Log.d(TAG, "MediaProjection result: resultCode=${result.resultCode}, hasData=${result.data != null}")
462464
if (result.resultCode == Activity.RESULT_OK && result.data != null) {
463-
Log.i(TAG, "MediaProjection permission granted, starting ScreenCaptureService with ACTION_START_CAPTURE")
464-
val serviceIntent = Intent(this, ScreenCaptureService::class.java).apply {
465-
action = ScreenCaptureService.ACTION_START_CAPTURE // Ensure this action
466-
putExtra(ScreenCaptureService.EXTRA_RESULT_CODE, result.resultCode)
467-
putExtra(ScreenCaptureService.EXTRA_RESULT_DATA, result.data!!)
468-
}
469-
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
470-
startForegroundService(serviceIntent)
471-
} else {
472-
startService(serviceIntent)
465+
if (this@MainActivity.isProcessingExplicitScreenshotRequest) {
466+
Log.i(TAG, "MediaProjection permission granted (explicit request), starting ScreenCaptureService with ACTION_START_CAPTURE")
467+
val serviceIntent = Intent(this, ScreenCaptureService::class.java).apply {
468+
action = ScreenCaptureService.ACTION_START_CAPTURE // Ensure this action
469+
putExtra(ScreenCaptureService.EXTRA_RESULT_CODE, result.resultCode)
470+
putExtra(ScreenCaptureService.EXTRA_RESULT_DATA, result.data!!)
471+
}
472+
if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.O) {
473+
startForegroundService(serviceIntent)
474+
} else {
475+
startService(serviceIntent)
476+
}
473477
}
478+
this@MainActivity.isProcessingExplicitScreenshotRequest = false
474479
} else {
475480
Log.w(TAG, "MediaProjection permission denied or cancelled by user.")
476481
Toast.makeText(this, "Screen capture permission denied", Toast.LENGTH_SHORT).show()
482+
// Also reset the flag if permission is denied for an explicit request
483+
if (this@MainActivity.isProcessingExplicitScreenshotRequest) {
484+
this@MainActivity.isProcessingExplicitScreenshotRequest = false
485+
}
477486
}
478487
}
479488

app/src/main/kotlin/com/google/ai/sample/ScreenCaptureService.kt

Lines changed: 45 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ class ScreenCaptureService : Service() {
5252
private var virtualDisplay: VirtualDisplay? = null
5353
private var imageReader: ImageReader? = null
5454
private var isReady = false // Flag to indicate if MediaProjection is set up and active
55+
private val isScreenshotRequestedRef = java.util.concurrent.atomic.AtomicBoolean(false)
5556

5657
// Callback for MediaProjection
5758
private val mediaProjectionCallback = object : MediaProjection.Callback() {
@@ -192,7 +193,8 @@ private fun takeScreenshot() {
192193
Log.e(TAG, "Cannot take screenshot - service not ready or mediaProjection is null. isReady=$isReady, mediaProjectionIsNull=${mediaProjection == null}")
193194
return
194195
}
195-
Log.d(TAG, "takeScreenshot: Preparing to capture.")
196+
isScreenshotRequestedRef.set(true)
197+
Log.d(TAG, "takeScreenshot: Preparing to capture. isScreenshotRequestedRef set to true.")
196198

197199
try {
198200
// Check if we need to initialize VirtualDisplay and ImageReader
@@ -235,34 +237,49 @@ private fun takeScreenshot() {
235237
}
236238

237239
localImageReader.setOnImageAvailableListener({ reader ->
238-
var image: android.media.Image? = null
239-
try {
240-
image = reader.acquireLatestImage()
241-
if (image != null) {
242-
val planes = image.planes
243-
val buffer = planes[0].buffer
244-
val pixelStride = planes[0].pixelStride
245-
val rowStride = planes[0].rowStride
246-
val rowPadding = rowStride - pixelStride * width
247-
248-
val bitmap = Bitmap.createBitmap(
249-
width + rowPadding / pixelStride,
250-
height,
251-
Bitmap.Config.ARGB_8888
252-
)
253-
bitmap.copyPixelsFromBuffer(buffer)
254-
Log.d(TAG, "Bitmap created, proceeding to save.")
255-
saveScreenshot(bitmap)
256-
} else {
257-
Log.w(TAG, "acquireLatestImage returned null.")
240+
if (isScreenshotRequestedRef.compareAndSet(true, false)) {
241+
Log.d(TAG, "Screenshot request flag consumed, processing image.")
242+
var image: android.media.Image? = null
243+
try {
244+
image = reader.acquireLatestImage()
245+
if (image != null) {
246+
val planes = image.planes
247+
val buffer = planes[0].buffer
248+
val pixelStride = planes[0].pixelStride
249+
val rowStride = planes[0].rowStride
250+
val rowPadding = rowStride - pixelStride * width
251+
252+
val bitmap = Bitmap.createBitmap(
253+
width + rowPadding / pixelStride,
254+
height,
255+
Bitmap.Config.ARGB_8888
256+
)
257+
bitmap.copyPixelsFromBuffer(buffer)
258+
Log.d(TAG, "Bitmap created, proceeding to save.")
259+
saveScreenshot(bitmap)
260+
} else {
261+
Log.w(TAG, "acquireLatestImage returned null despite requested flag.")
262+
}
263+
} catch (e: Exception) {
264+
Log.e(TAG, "Error processing image in listener", e)
265+
} finally {
266+
image?.close()
267+
// Do NOT release VirtualDisplay or ImageReader here
268+
// They will be reused for the next screenshot
269+
Log.d(TAG, "Screenshot processed (or attempted), keeping resources for reuse.")
270+
}
271+
} else {
272+
// Logic to discard the frame if no screenshot was formally requested
273+
Log.w(TAG, "OnImageAvailableListener invoked but no screenshot was requested or flag already consumed. Discarding frame.")
274+
var imageToDiscard: android.media.Image? = null
275+
try {
276+
imageToDiscard = reader.acquireLatestImage()
277+
} catch (e: Exception) {
278+
// This catch is important because acquireLatestImage can fail if buffers are truly messed up
279+
Log.e(TAG, "Error acquiring image to discard in OnImageAvailableListener else block", e)
280+
} finally {
281+
imageToDiscard?.close()
258282
}
259-
} catch (e: Exception) {
260-
Log.e(TAG, "Error processing image", e)
261-
} finally {
262-
image?.close()
263-
// Do NOT release VirtualDisplay or ImageReader here
264-
// They will be reused for the next screenshot
265-
Log.d(TAG, "Screenshot captured, keeping resources for reuse.")
266283
}
267284
}, Handler(Looper.getMainLooper()))
268285

app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -180,8 +180,8 @@ class ScreenOperatorAccessibilityService : AccessibilityService() {
180180

181181
private fun scheduleNextCommandProcessing() {
182182
val nextCommandDelay = if (commandQueue.peek() is Command.TakeScreenshot) {
183-
Log.d(TAG, "Next command in queue is TakeScreenshot, scheduling with 850ms delay.")
184-
850L
183+
Log.d(TAG, "Next command in queue is TakeScreenshot, scheduling with 50ms delay.")
184+
50L
185185
} else {
186186
500L
187187
}

app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt

Lines changed: 28 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,8 @@ class PhotoReasoningViewModel(
5959

6060
// Keep track of the latest screenshot URI
6161
private var latestScreenshotUri: Uri? = null
62+
private var lastProcessedScreenshotUri: Uri? = null
63+
private var lastProcessedScreenshotTime: Long = 0L
6264

6365
// Keep track of the current selected images
6466
private var currentSelectedImages: List<Bitmap> = emptyList()
@@ -104,19 +106,26 @@ class PhotoReasoningViewModel(
104106

105107
fun reason(
106108
userInput: String,
107-
selectedImages: List<Bitmap>
109+
selectedImages: List<Bitmap>,
110+
screenInfoForPrompt: String? = null
108111
) {
109-
Log.d(TAG, "reason() called. User input: '$userInput', Image count: ${selectedImages.size}")
112+
Log.d(TAG, "reason() called. User input: '$userInput', Image count: ${selectedImages.size}, ScreenInfo: ${screenInfoForPrompt != null}")
110113
_uiState.value = PhotoReasoningUiState.Loading
111114
Log.d(TAG, "Setting _showStopNotificationFlow to true")
112115
_showStopNotificationFlow.value = true
113116
Log.d(TAG, "_showStopNotificationFlow value is now: ${_showStopNotificationFlow.value}")
114117
stopExecutionFlag.set(false)
115118

116-
val prompt = "FOLLOW THE INSTRUCTIONS STRICTLY: $userInput"
119+
val combinedPromptTextBuilder = StringBuilder(userInput)
120+
if (screenInfoForPrompt != null && screenInfoForPrompt.isNotBlank()) { // Added isNotBlank check
121+
combinedPromptTextBuilder.append("\n\nScreen Context:\n$screenInfoForPrompt")
122+
}
123+
val aiPromptText = combinedPromptTextBuilder.toString()
124+
125+
val prompt = "FOLLOW THE INSTRUCTIONS STRICTLY: $aiPromptText"
117126

118127
// Store the current user input and selected images
119-
currentUserInput = userInput
128+
currentUserInput = userInput // This should ideally store aiPromptText or handle context separately if needed for retry. For now, task is specific to prompt to AI and chat.
120129
currentSelectedImages = selectedImages
121130

122131
// Clear previous commands
@@ -125,7 +134,7 @@ class PhotoReasoningViewModel(
125134

126135
// Add user message to chat history
127136
val userMessage = PhotoReasoningMessage(
128-
text = userInput,
137+
text = aiPromptText, // Use the combined text
129138
participant = PhotoParticipant.USER,
130139
isPending = false
131140
)
@@ -932,6 +941,14 @@ class PhotoReasoningViewModel(
932941
context: Context,
933942
screenInfo: String? = null
934943
) {
944+
val currentTime = System.currentTimeMillis()
945+
if (screenshotUri == lastProcessedScreenshotUri && (currentTime - lastProcessedScreenshotTime) < 2000) { // 2-second debounce window
946+
Log.w(TAG, "addScreenshotToConversation: Debouncing duplicate/rapid call for URI $screenshotUri")
947+
return // Exit the function early if it's a duplicate call within the window
948+
}
949+
lastProcessedScreenshotUri = screenshotUri
950+
lastProcessedScreenshotTime = currentTime
951+
935952
PhotoReasoningApplication.applicationScope.launch(Dispatchers.Main) {
936953
try {
937954
Log.d(TAG, "Adding screenshot to conversation: $screenshotUri")
@@ -953,25 +970,6 @@ class PhotoReasoningViewModel(
953970
// Show toast
954971
Toast.makeText(context, "Processing screenshot...", Toast.LENGTH_SHORT).show()
955972

956-
// Create message text with screen information if available
957-
val messageText = if (screenInfo != null) {
958-
"Screenshot captured\n\n$screenInfo"
959-
} else {
960-
"Screenshot captured"
961-
}
962-
963-
// Add screenshot message to chat history
964-
val screenshotMessage = PhotoReasoningMessage(
965-
text = messageText,
966-
participant = PhotoParticipant.USER,
967-
imageUris = listOf(screenshotUri.toString())
968-
)
969-
_chatState.addMessage(screenshotMessage)
970-
_chatMessagesFlow.value = chatMessages
971-
972-
// Save chat history after adding screenshot
973-
saveChatHistory(context)
974-
975973
// Process the screenshot
976974
val imageRequest = imageRequestBuilder!!
977975
.data(screenshotUri)
@@ -998,14 +996,14 @@ class PhotoReasoningViewModel(
998996
Toast.makeText(context, "Screenshot added, sending to AI...", Toast.LENGTH_SHORT).show()
999997

1000998
// Create prompt with screen information if available
1001-
val prompt = if (screenInfo != null) {
1002-
"Analyze this screenshot. Here is the available screen information: $screenInfo"
1003-
} else {
1004-
"Analyze this screenshot"
1005-
}
999+
val genericAnalysisPrompt = "Analyze the provided screenshot and its context."
10061000

10071001
// Re-send the query with only the latest screenshot
1008-
reason(prompt, listOf(bitmap))
1002+
reason(
1003+
userInput = genericAnalysisPrompt,
1004+
selectedImages = listOf(bitmap),
1005+
screenInfoForPrompt = screenInfo
1006+
)
10091007

10101008
// Show a toast to indicate the screenshot was added
10111009
Toast.makeText(context, "Screenshot added to conversation", Toast.LENGTH_SHORT).show()

0 commit comments

Comments
 (0)