Chat: add vision support, upload images
Previously, it was only possible to send text to Cody via Chat. This PR
adds support for additionally sending images to Cody. This can be helpful,
for example, when you want to write a basic HTML structure based on
a Figma design.
olafurpg authored and arafatkatze committed Feb 2, 2025
1 parent f32989a commit 2e42f63
Showing 15 changed files with 254 additions and 11 deletions.
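
The feature is gated behind a new cody.experimental.imageUpload setting and flows through three hops: the chat webview posts the selected file's base64 payload with a new 'chat/upload-file' command, ChatController stashes it on the ChatBuilder, and the PromptBuilder finally wraps it in an OpenAI-style image_url message. As a rough sketch of the webview side, assuming the getVSCodeAPI helper used elsewhere in the webview (the import path and function name here are illustrative, not the exact implementation):

// Illustrative sketch only; mirrors the logic added to HumanMessageEditor below.
import { getVSCodeAPI } from './utils/VSCodeApi' // approximate path

async function uploadImageToChat(file: File): Promise<void> {
    // Read the file as a data URL, e.g. "data:image/png;base64,iVBORw0..."
    const dataUrl = await new Promise<string>((resolve, reject) => {
        const reader = new FileReader()
        reader.onload = () => resolve(String(reader.result))
        reader.onerror = () => reject(new Error('Failed to read file'))
        reader.readAsDataURL(file)
    })
    // Post only the raw base64 payload; the extension re-derives the MIME type later.
    const base64 = dataUrl.split(',')[1]
    getVSCodeAPI().postMessage({ command: 'chat/upload-file', base64 })
}
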
9 changes: 5 additions & 4 deletions lib/shared/src/chat/chat.ts
@@ -57,13 +57,14 @@ export class ChatClient {

// We only want to send up the speaker and prompt text, regardless of whatever other fields
// might be on the messages objects (`file`, `displayText`, `contextFiles`, etc.).
const messagesToSend = augmentedMessages.map(({ speaker, text, cache_enabled }) => ({
const messagesToSend = augmentedMessages.map(({ speaker, text, cache_enabled, content }) => ({
text,
speaker,
cache_enabled,
content,
}))

const completionParams = {
const completionParams: CompletionParameters = {
...DEFAULT_CHAT_COMPLETION_PARAMETERS,
...params,
messages: messagesToSend,
@@ -108,8 +109,8 @@ export function sanitizeMessages(messages: Message[]): Message[] {
// the next one
const nextMessage = sanitizedMessages[index + 1]
if (
(nextMessage.speaker === 'assistant' && !nextMessage.text?.length) ||
(message.speaker === 'assistant' && !message.text?.length)
(nextMessage.speaker === 'assistant' && !nextMessage.text?.length && !nextMessage.content) ||
(message.speaker === 'assistant' && !message.text?.length && !message.content)
) {
return false
}
1 change: 1 addition & 0 deletions lib/shared/src/chat/transcript/messages.ts
@@ -22,6 +22,7 @@ export interface SubMessage {

export interface ChatMessage extends Message {
contextFiles?: ContextItem[]
base64Image?: string

contextAlternatives?: RankedContext[]

1 change: 1 addition & 0 deletions lib/shared/src/configuration.ts
@@ -112,6 +112,7 @@ interface RawClientConfiguration {
experimentalMinionAnthropicKey: string | undefined
experimentalNoxideEnabled: boolean
experimentalGuardrailsTimeoutSeconds: number | undefined
experimentalImageUpload: boolean

//#region Unstable
internalUnstable: boolean
7 changes: 7 additions & 0 deletions lib/shared/src/sourcegraph-api/completions/types.ts
@@ -24,6 +24,8 @@ export interface Message {
// mirrors what OpenAI and Anthropic expect
text?: PromptString
cache_enabled?: boolean | null
content?: string | MessagePart[]
base64Image?: string
}

export interface CompletionUsage {
@@ -43,6 +45,10 @@ export interface CompletionResponse {
stopReason?: string
}

type MessagePart =
| { type: 'text'; text: string } // a normal text message
| { type: 'image_url'; image_url: { url: string } } // image message, per https://platform.openai.com/docs/guides/vision

export interface CompletionParameters {
fast?: boolean
messages: Message[]
@@ -61,6 +67,7 @@ export interface CompletionParameters {
type: 'content'
content: string
}
base64Image?: string
}

export interface SerializedCompletionParameters extends Omit<CompletionParameters, 'messages'> {
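
To make the new content union concrete: a message can still carry a plain string, or an array of parts mixing text with an image_url entry whose URL is a base64 data URL, per the OpenAI vision docs linked above. A hypothetical example, written as if it sat alongside these types (payload truncated):

// Hypothetical messages exercising both arms of the new `content` field.
const plainMessage: Message = {
    speaker: 'human',
    content: 'Write basic HTML for the attached mockup.',
}

const visionMessage: Message = {
    speaker: 'human',
    content: [
        { type: 'text', text: 'Write basic HTML for the attached mockup.' },
        {
            type: 'image_url',
            image_url: { url: 'data:image/png;base64,iVBORw0KGgoAAA' }, // truncated payload
        },
    ],
}
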
6 changes: 6 additions & 0 deletions vscode/package.json
@@ -1084,6 +1084,12 @@
"markdownDescription": "Enable OpenTelemetry tracing",
"default": false
},
"cody.experimental.imageUpload": {
"order": 99,
"type": "boolean",
"markdownDescription": "Enable image support",
"default": false
},
"cody.experimental.commitMessage": {
"order": 99,
"type": "boolean",
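
On the extension side this setting is read through getHiddenSetting('experimental.imageUpload', false) (see vscode/src/configuration.ts further down), and the webview reads it from the resolved configuration to decide whether to render the upload button. For reference, checking the same gate directly against the standard VS Code API would look roughly like this:

// Sketch: reading the new gate with the stock VS Code configuration API.
import * as vscode from 'vscode'

const imageUploadEnabled = vscode.workspace
    .getConfiguration('cody')
    .get<boolean>('experimental.imageUpload', false)
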
27 changes: 26 additions & 1 deletion vscode/src/chat/chat-view/ChatBuilder.ts
@@ -157,7 +157,7 @@ export class ChatBuilder {
if (this.messages.at(-1)?.speaker === 'human') {
throw new Error('Cannot add a user message after a user message')
}
this.messages.push({ ...message, speaker: 'human' })
this.messages.push({ ...message, speaker: 'human', base64Image: this.getAndResetImage() })
this.changeNotifications.next()
}

@@ -322,6 +322,31 @@
}
return result
}

/**
* Store the base64-encoded image uploaded by the user to a multi-modal model.
* Requires vision support in the model, added in the PR
* https://github.com/sourcegraph/sourcegraph/pull/546
*/
private image: string | undefined = undefined

/**
* Sets the base64-encoded image for the chat model.
* @param base64Image - The base64-encoded image data to set.
*/
public setImage(base64Image: string): void {
this.image = base64Image
}

/**
* Gets the base64-encoded image for the chat model and resets the internal image property to undefined.
* @returns The base64-encoded image, or undefined if no image has been set.
*/
public getAndResetImage(): string | undefined {
const image = this.image
this.image = undefined
return image
}
}

function messageToSerializedChatInteraction(
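
The two new accessors give the stored image one-shot semantics: addHumanMessage consumes it via getAndResetImage, so an upload is attached to exactly one human message and never re-sent. A small illustrative fragment (the builder instance and payload are hypothetical):

// Illustrative only; assumes an existing builder with the methods added above.
declare const chatBuilder: {
    setImage(base64Image: string): void
    getAndResetImage(): string | undefined
}

chatBuilder.setImage('iVBORw0KGgoAAA') // stash the (truncated) upload
chatBuilder.getAndResetImage()         // -> 'iVBORw0KGgoAAA'; this is what addHumanMessage attaches
chatBuilder.getAndResetImage()         // -> undefined, so later messages carry no image
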
5 changes: 5 additions & 0 deletions vscode/src/chat/chat-view/ChatController.ts
@@ -513,6 +513,11 @@ export class ChatController implements vscode.Disposable, vscode.WebviewViewProv
}
break
}

case 'chat/upload-file': {
this.chatBuilder.setImage(message.base64)
break
}
case 'log': {
const logger = message.level === 'debug' ? logDebug : logError
logger(message.filterLabel, message.message)
3 changes: 3 additions & 0 deletions vscode/src/chat/chat-view/prompt.ts
@@ -89,6 +89,9 @@ export class DefaultPrompter {
`Ignored ${messagesIgnored} chat messages due to context limit`
)
}
for (const message of reverseTranscript) {
promptBuilder.tryAddImage(message.base64Image)
}
// Counter for context items categorized by source
const ignoredContext = { user: 0, corpus: 0, transcript: 0 }

1 change: 1 addition & 0 deletions vscode/src/chat/protocol.ts
@@ -156,6 +156,7 @@ export type WebviewMessage =
selectedFilters: NLSSearchDynamicFilter[]
}
| { command: 'action/confirmation'; id: string; response: boolean }
| { command: 'chat/upload-file'; base64: string }

export interface SmartApplyResult {
taskId: FixupTaskID
1 change: 1 addition & 0 deletions vscode/src/configuration.ts
@@ -146,6 +146,7 @@ export function getConfiguration(
>('autocomplete.experimental.graphContext', null),
experimentalCommitMessage: getHiddenSetting('experimental.commitMessage', true),
experimentalNoodle: getHiddenSetting('experimental.noodle', false),
experimentalImageUpload: getHiddenSetting('experimental.imageUpload', false),

experimentalTracing: getHiddenSetting('experimental.tracing', false),

43 changes: 42 additions & 1 deletion vscode/src/prompt-builder/index.ts
@@ -33,6 +33,7 @@ export class PromptBuilder {
* A list of context items that are used to build context messages.
*/
public contextItems: ContextItem[] = []
public images: string[] = []

/**
* Convenience constructor because loading the tokenizer is async due to its large size.
@@ -47,10 +48,29 @@
if (this.contextItems.length > 0) {
this.buildContextMessages()
}

this.buildImageMessages()
return this.prefixMessages.concat([...this.reverseMessages].reverse())
}

private buildImageMessages(): void {
for (const image of this.images) {
// Detect image type from the base64 header
const imageType = detectImageType(image)
const imageMessage: Message = {
speaker: 'human',
content: [
{
type: 'image_url',
image_url: {
url: `data:${imageType};base64,${image}`,
},
},
],
}
this.reverseMessages.push(...[ASSISTANT_MESSAGE, imageMessage])
}
}

private buildContextMessages(): void {
for (const item of this.contextItems) {
// Create context messages for each context item, where
@@ -108,6 +128,12 @@
return undefined
}

public tryAddImage(base64Image: string | undefined): void {
if (base64Image) {
this.images.push(base64Image)
}
}

public async tryAddContext(
type: ContextTokenUsageType | 'history',
contextItems: ContextItem[]
@@ -189,3 +215,18 @@
return result
}
}

function detectImageType(base64String: string): string {
// Check the first few bytes of the base64 string to determine image type
const header = base64String.substring(0, 8)

// Common image signatures in base64
if (header.startsWith('/9j/')) return 'image/jpeg'
if (header.startsWith('iVBORw0')) return 'image/png'
if (header.startsWith('R0lGOD')) return 'image/gif'
if (header.startsWith('UklGR')) return 'image/webp'
if (header.startsWith('PHN2Z')) return 'image/svg+xml'

// Default to jpeg if unknown
return 'image/jpeg'
}
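
Together with the webview code further down, the payload makes a simple round trip: the webview strips the data-URL prefix before posting the base64 string, and buildImageMessages reconstructs the prefix from the detected MIME type. A quick sketch with a hypothetical, truncated PNG payload (detectImageType is module-private, so this is notional):

// Round trip of a truncated, hypothetical PNG payload through the helpers above.
const dataUrl = 'data:image/png;base64,iVBORw0KGgoAAA'
const payload = dataUrl.split(',')[1] // what the webview posts to the extension
const rebuilt = `data:${detectImageType(payload)};base64,${payload}` // what buildImageMessages emits
console.log(rebuilt === dataUrl) // true
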
@@ -31,6 +31,7 @@ import {
import type { UserAccountInfo } from '../../../../../Chat'
import { type ClientActionListener, useClientActionListener } from '../../../../../client/clientState'
import { promptModeToIntent } from '../../../../../prompts/PromptsTab'
import { getVSCodeAPI } from '../../../../../utils/VSCodeApi'
import { useTelemetryRecorder } from '../../../../../utils/telemetry'
import { useFeatureFlag } from '../../../../../utils/useFeatureFlags'
import { useLinkOpener } from '../../../../../utils/useLinkOpener'
@@ -99,6 +100,8 @@
}) => {
const telemetryRecorder = useTelemetryRecorder()

const [imageFile, setImageFile] = useState<File | undefined>(undefined)

const editorRef = useRef<PromptEditorRefAPI>(null)
useImperativeHandle(parentEditorRef, (): PromptEditorRefAPI | null => editorRef.current, [])

@@ -126,7 +129,7 @@
const experimentalPromptEditorEnabled = useFeatureFlag(FeatureFlag.CodyExperimentalPromptEditor)

const onSubmitClick = useCallback(
(intent?: ChatMessage['intent'], forceSubmit?: boolean): void => {
async (intent?: ChatMessage['intent'], forceSubmit?: boolean): Promise<void> => {
if (!forceSubmit && submitState === 'emptyEditorValue') {
return
}
@@ -142,6 +145,28 @@

const value = editorRef.current.getSerializedValue()
parentOnSubmit(intent)
if (imageFile) {
const readFileGetBase64String = (file: File): Promise<string> => {
return new Promise((resolve, reject) => {
const reader = new FileReader()
reader.onload = () => {
const base64 = reader.result
if (base64 && typeof base64 === 'string') {
resolve(base64.split(',')[1])
} else {
reject(new Error('Failed to read file'))
}
}
reader.onerror = () => reject(new Error('Failed to read file'))
reader.readAsDataURL(file)
})
}

const base64 = await readFileGetBase64String(imageFile)
getVSCodeAPI().postMessage({ command: 'chat/upload-file', base64 })
setImageFile(undefined)
}
parentOnSubmit(intent)

telemetryRecorder.recordEvent('cody.humanMessageEditor', 'submit', {
metadata: {
@@ -157,7 +182,15 @@
},
})
},
[submitState, parentOnSubmit, onStop, telemetryRecorder.recordEvent, isFirstMessage, isSent]
[
submitState,
parentOnSubmit,
onStop,
telemetryRecorder.recordEvent,
isFirstMessage,
isSent,
imageFile,
]
)

const onEditorEnterKey = useCallback(
@@ -470,6 +503,8 @@
hidden={!focused && isSent}
className={styles.toolbar}
intent={intent}
imageFile={imageFile}
setImageFile={setImageFile}
/>
)}
</div>
@@ -1,7 +1,8 @@
import type { Action, ChatMessage, Model } from '@sourcegraph/cody-shared'
import { useExtensionAPI } from '@sourcegraph/prompt-editor'
import type { ResolvedConfiguration } from '@sourcegraph/cody-shared'
import { useExtensionAPI, useObservable } from '@sourcegraph/prompt-editor'
import clsx from 'clsx'
import { type FunctionComponent, useCallback } from 'react'
import { type FunctionComponent, useCallback, useMemo } from 'react'
import type { UserAccountInfo } from '../../../../../../Chat'
import { ModelSelectField } from '../../../../../../components/modelSelectField/ModelSelectField'
import { PromptSelectField } from '../../../../../../components/promptSelectField/PromptSelectField'
@@ -10,10 +11,15 @@ import { useActionSelect } from '../../../../../../prompts/PromptsTab'
import { useClientConfig } from '../../../../../../utils/useClientConfig'
import { AddContextButton } from './AddContextButton'
import { SubmitButton, type SubmitButtonState } from './SubmitButton'

import { UploadImageButton } from './UploadImageButton'
/**
* The toolbar for the human message editor.
*/
function useResolvedConfig(): ResolvedConfiguration | undefined {
const resolvedConfig = useExtensionAPI().resolvedConfig
return useObservable(useMemo(() => resolvedConfig(), [resolvedConfig])).value
}

export const Toolbar: FunctionComponent<{
models: Model[]
userInfo: UserAccountInfo
@@ -35,6 +41,9 @@
intent?: ChatMessage['intent']

manuallySelectIntent: (intent: ChatMessage['intent']) => void

imageFile?: File
setImageFile: (file: File | undefined) => void
}> = ({
userInfo,
isEditorFocused,
@@ -48,6 +57,8 @@
models,
intent,
manuallySelectIntent,
imageFile,
setImageFile,
}) => {
/**
* If the user clicks in a gap or on the toolbar outside of any of its buttons, report back to
@@ -64,6 +75,8 @@
},
[onGapClick]
)
const resolvedConfig = useResolvedConfig()
const imageUploadEnabled = resolvedConfig?.configuration.experimentalImageUpload

return (
// biome-ignore lint/a11y/useKeyWithClickEvents: only relevant to click areas
@@ -88,6 +101,13 @@
/>
)}
<PromptSelectFieldToolbarItem focusEditor={focusEditor} className="tw-ml-1 tw-mr-1" />
{imageUploadEnabled && (
<UploadImageButton
className="tw-opacity-60"
imageFile={imageFile}
onClick={setImageFile}
/>
)}
<ModelSelectFieldToolbarItem
models={models}
userInfo={userInfo}