Add text-to-speech (synthesis) sessions to the speech provider plumbing.

What this patch does, file by file:

  * vscode.proposed.speech.d.ts
      Adds TextToSpeechStatus, TextToSpeechEvent, TextToSpeechSession and
      SpeechProvider.provideTextToSpeechSession(token) to the proposed API.
  * extHostTypes.ts / extHost.api.impl.ts
      Adds the TextToSpeechStatus enum and exposes it on the API object.
  * extHost.protocol.ts
      Adds $emitTextToSpeechEvent to the main thread shape and
      $createTextToSpeechSession / $synthesizeSpeech /
      $cancelTextToSpeechSession to the extension host shape.
  * extHostSpeech.ts
      Creates a provider session per session id, keeps it in `synthesizers`,
      forwards its onDidChange events to the main thread (dropped once the
      session's token is cancelled) and disposes everything on cancellation.
  * mainThreadSpeech.ts
      Registers createTextToSpeechSession on the provider object: picks a
      session id, asks the extension host to create the session, and returns
      an object whose synthesize(text) forwards to $synthesizeSpeech.
      Cancelling the token cancels the remote session and drops local state.
  * contrib/speech/common/speechService.ts
      Adds the TextToSpeechInProgress context key, TextToSpeechStatus,
      ITextToSpeechEvent, ITextToSpeechSession and the matching members on
      ISpeechProvider / ISpeechService.
  * contrib/speech/browser/speechService.ts
      Implements createTextToSpeechSession: tracks the active session, keeps
      the context key and start/end events in sync with Started / Stopped /
      cancellation, and logs 'textToSpeechSession' telemetry (context,
      duration, error flag) when the active session ends.
  * contrib/chat/test/common/voiceChat.test.ts
      Extends the test speech service with the new members.

Review fixes folded into the added (+) lines (hunk line counts unchanged):

  * Restored type arguments that were missing on added lines (Emitter<...>,
    Event<...>, Promise<...>, Map<...>, RawContextKey<boolean>, and the two
    type arguments of publicLog2, whose locally declared Event/Classification
    types were otherwise unused).
  * TextToSpeechInProgress is a boolean context key; its metadata now says
    type 'boolean' instead of 'string'.

NOTE(review): context and removed (-) lines are reproduced verbatim because
they must match the target files. They show the same loss of type arguments
(e.g. "Emitter;", "new Map();"), so if this patch does not apply cleanly,
regenerate it from the working tree rather than hand-editing context.

diff --git a/src/vs/workbench/api/browser/mainThreadSpeech.ts b/src/vs/workbench/api/browser/mainThreadSpeech.ts
index 56ce1bca623..fcb28dbc417 100644
--- a/src/vs/workbench/api/browser/mainThreadSpeech.ts
+++ b/src/vs/workbench/api/browser/mainThreadSpeech.ts
@@ -7,13 +7,18 @@ import { Emitter, Event } from 'vs/base/common/event';
 import { DisposableStore, IDisposable } from 'vs/base/common/lifecycle';
 import { ILogService } from 'vs/platform/log/common/log';
 import { ExtHostContext, ExtHostSpeechShape, MainContext, MainThreadSpeechShape } from 'vs/workbench/api/common/extHost.protocol';
-import { IKeywordRecognitionEvent, ISpeechProviderMetadata, ISpeechService, ISpeechToTextEvent } from 'vs/workbench/contrib/speech/common/speechService';
+import { IKeywordRecognitionEvent, ISpeechProviderMetadata, ISpeechService, ISpeechToTextEvent, ITextToSpeechEvent } from 'vs/workbench/contrib/speech/common/speechService';
 import { IExtHostContext, extHostNamedCustomer } from 'vs/workbench/services/extensions/common/extHostCustomers';
 
 type SpeechToTextSession = {
 	readonly onDidChange: Emitter;
 };
 
+type TextToSpeechSession = {
+	readonly onDidChange: Emitter<ITextToSpeechEvent>;
+	synthesize(text: string): Promise<void>;
+};
+
 type KeywordRecognitionSession = {
 	readonly onDidChange: Emitter;
 };
@@ -26,6 +31,7 @@ export class MainThreadSpeech implements MainThreadSpeechShape {
 
 	private readonly providerRegistrations = new Map();
 	private readonly speechToTextSessions = new Map();
+	private readonly textToSpeechSessions = new Map<number, TextToSpeechSession>();
 	private readonly keywordRecognitionSessions = new Map();
 
 	constructor(
@@ -66,6 +72,36 @@ export class MainThreadSpeech implements MainThreadSpeechShape {
 					onDidChange: onDidChange.event
 				};
 			},
+			createTextToSpeechSession: (token) => {
+				if (token.isCancellationRequested) {
+					return {
+						onDidChange: Event.None,
+						synthesize: async () => { }
+					};
+				}
+
+				const disposables = new DisposableStore();
+				const session = Math.random(); // id shared with the extension host for this session
+
+				this.proxy.$createTextToSpeechSession(handle, session);
+
+				const onDidChange = disposables.add(new Emitter<ITextToSpeechEvent>());
+				this.textToSpeechSessions.set(session, {
+					onDidChange,
+					synthesize: text => this.proxy.$synthesizeSpeech(session, text)
+				});
+
+				disposables.add(token.onCancellationRequested(() => {
+					this.proxy.$cancelTextToSpeechSession(session);
+					this.textToSpeechSessions.delete(session);
+					disposables.dispose(); // also disposes the onDidChange emitter registered above
+				}));
+
+				return {
+					onDidChange: onDidChange.event,
+					synthesize: text => this.proxy.$synthesizeSpeech(session, text)
+				};
+			},
 			createKeywordRecognitionSession: token => {
 				if (token.isCancellationRequested) {
 					return {
@@ -112,6 +148,11 @@ export class MainThreadSpeech implements MainThreadSpeechShape {
 		providerSession?.onDidChange.fire(event);
 	}
 
+	$emitTextToSpeechEvent(session: number, event: ITextToSpeechEvent): void {
+		const providerSession = this.textToSpeechSessions.get(session);
+		providerSession?.onDidChange.fire(event); // unknown or already removed sessions are ignored
+	}
+
 	$emitKeywordRecognitionEvent(session: number, event: IKeywordRecognitionEvent): void {
 		const providerSession = this.keywordRecognitionSessions.get(session);
 		providerSession?.onDidChange.fire(event);
@@ -124,6 +165,9 @@ export class MainThreadSpeech implements MainThreadSpeechShape {
 		this.speechToTextSessions.forEach(session => session.onDidChange.dispose());
 		this.speechToTextSessions.clear();
 
+		this.textToSpeechSessions.forEach(session => session.onDidChange.dispose());
+		this.textToSpeechSessions.clear();
+
 		this.keywordRecognitionSessions.forEach(session => session.onDidChange.dispose());
 		this.keywordRecognitionSessions.clear();
 	}
diff --git a/src/vs/workbench/api/common/extHost.api.impl.ts b/src/vs/workbench/api/common/extHost.api.impl.ts
index 117a6108395..2b49b0f59d6 100644
--- a/src/vs/workbench/api/common/extHost.api.impl.ts
+++ b/src/vs/workbench/api/common/extHost.api.impl.ts
@@ -1698,6 +1698,7 @@ export function createApiFactoryAndRegisterActors(accessor: ServicesAccessor): I
 			DebugThread: extHostTypes.DebugThread,
 			RelatedInformationType: extHostTypes.RelatedInformationType,
 			SpeechToTextStatus: extHostTypes.SpeechToTextStatus,
+			TextToSpeechStatus: extHostTypes.TextToSpeechStatus,
 			PartialAcceptTriggerKind: extHostTypes.PartialAcceptTriggerKind,
 			KeywordRecognitionStatus: extHostTypes.KeywordRecognitionStatus,
 			ChatResponseMarkdownPart: extHostTypes.ChatResponseMarkdownPart,
diff --git a/src/vs/workbench/api/common/extHost.protocol.ts b/src/vs/workbench/api/common/extHost.protocol.ts
index e847d35fd95..16e6fb46787 100644
--- a/src/vs/workbench/api/common/extHost.protocol.ts
+++ b/src/vs/workbench/api/common/extHost.protocol.ts
@@ -64,7 +64,7 @@ import { ICellRange } from 'vs/workbench/contrib/notebook/common/notebookRange';
 import { InputValidationType } from 'vs/workbench/contrib/scm/common/scm';
 import { IWorkspaceSymbol, NotebookPriorityInfo } from 'vs/workbench/contrib/search/common/search';
 import { IRawClosedNotebookFileMatch } from 'vs/workbench/contrib/search/common/searchNotebookHelpers';
-import { IKeywordRecognitionEvent, ISpeechProviderMetadata, ISpeechToTextEvent } from 'vs/workbench/contrib/speech/common/speechService';
+import { IKeywordRecognitionEvent, ISpeechProviderMetadata, ISpeechToTextEvent, ITextToSpeechEvent } from 'vs/workbench/contrib/speech/common/speechService';
 import { CoverageDetails, ExtensionRunTestsRequest, ICallProfileRunHandler, IFileCoverage, ISerializedTestResults, IStartControllerTests, ITestItem, ITestMessage, ITestRunProfile, ITestRunTask, ResolvedTestRunRequest, TestResultState, TestsDiffOp } from 'vs/workbench/contrib/testing/common/testTypes';
 import { Timeline, TimelineChangeEvent, TimelineOptions, TimelineProviderDescriptor } from 'vs/workbench/contrib/timeline/common/timeline';
 import { TypeHierarchyItem } from 'vs/workbench/contrib/typeHierarchy/common/typeHierarchy';
@@ -1181,6 +1181,7 @@ export interface MainThreadSpeechShape extends IDisposable {
 	$unregisterProvider(handle: number): void;
 
 	$emitSpeechToTextEvent(session: number, event: ISpeechToTextEvent): void;
+	$emitTextToSpeechEvent(session: number, event: ITextToSpeechEvent): void;
 	$emitKeywordRecognitionEvent(session: number, event: IKeywordRecognitionEvent): void;
 }
 
@@ -1188,6 +1189,10 @@ export interface ExtHostSpeechShape {
 	$createSpeechToTextSession(handle: number, session: number, language?: string): Promise;
 	$cancelSpeechToTextSession(session: number): Promise;
 
+	$createTextToSpeechSession(handle: number, session: number): Promise<void>;
+	$synthesizeSpeech(session: number, text: string): Promise<void>;
+	$cancelTextToSpeechSession(session: number): Promise<void>;
+
 	$createKeywordRecognitionSession(handle: number, session: number): Promise;
 	$cancelKeywordRecognitionSession(session: number): Promise;
 }
diff --git a/src/vs/workbench/api/common/extHostSpeech.ts b/src/vs/workbench/api/common/extHostSpeech.ts
index 9093f63e3ab..abc56cedc08 100644
--- a/src/vs/workbench/api/common/extHostSpeech.ts
+++ b/src/vs/workbench/api/common/extHostSpeech.ts
@@ -17,6 +17,7 @@ export class ExtHostSpeech implements ExtHostSpeechShape {
 
 	private readonly providers = new Map();
 	private readonly sessions = new Map();
+	private readonly synthesizers = new Map(); // NOTE(review): type arguments look lost here - restore <session id, provider session> types
 
 	constructor(
 		mainContext: IMainContext
@@ -52,6 +53,46 @@ export class ExtHostSpeech implements ExtHostSpeechShape {
 		this.sessions.delete(session);
 	}
 
+	async $createTextToSpeechSession(handle: number, session: number): Promise<void> {
+		const provider = this.providers.get(handle);
+		if (!provider) {
+			return;
+		}
+
+		const disposables = new DisposableStore();
+
+		const cts = new CancellationTokenSource();
+		this.sessions.set(session, cts);
+
+		const textToSpeech = disposables.add(provider.provideTextToSpeechSession(cts.token)); // disposed together with `disposables` on cancellation
+		this.synthesizers.set(session, textToSpeech);
+
+		disposables.add(textToSpeech.onDidChange(e => {
+			if (cts.token.isCancellationRequested) {
+				return;
+			}
+
+			this.proxy.$emitTextToSpeechEvent(session, e);
+		}));
+
+		disposables.add(cts.token.onCancellationRequested(() => disposables.dispose()));
+	}
+
+	async $synthesizeSpeech(session: number, text: string): Promise<void> {
+		const synthesizer = this.synthesizers.get(session);
+		if (!synthesizer) {
+			return;
+		}
+
+		synthesizer.synthesize(text);
+	}
+
+	async $cancelTextToSpeechSession(session: number): Promise<void> {
+		this.sessions.get(session)?.dispose(true);
+		this.sessions.delete(session);
+		this.synthesizers.delete(session);
+	}
+
 	async $createKeywordRecognitionSession(handle: number, session: number): Promise {
 		const provider = this.providers.get(handle);
 		if (!provider) {
diff --git a/src/vs/workbench/api/common/extHostTypes.ts b/src/vs/workbench/api/common/extHostTypes.ts
index ff0d6f9e5b9..551e1289cc9 100644
--- a/src/vs/workbench/api/common/extHostTypes.ts
+++ b/src/vs/workbench/api/common/extHostTypes.ts
@@ -4484,6 +4484,12 @@ export enum SpeechToTextStatus {
 	Error = 5
 }
 
+export enum TextToSpeechStatus {
+	Started = 1,
+	Stopped = 2,
+	Error = 3
+}
+
 export enum KeywordRecognitionStatus {
 	Recognized = 1,
 	Stopped = 2
diff --git a/src/vs/workbench/contrib/chat/test/common/voiceChat.test.ts b/src/vs/workbench/contrib/chat/test/common/voiceChat.test.ts
index 10b5b660fd4..5e94b169a06 100644
--- a/src/vs/workbench/contrib/chat/test/common/voiceChat.test.ts
+++ b/src/vs/workbench/contrib/chat/test/common/voiceChat.test.ts
@@ -15,7 +15,7 @@ import { ChatAgentLocation, IChatAgent, IChatAgentCommand, IChatAgentData, IChat
 import { IChatModel } from 'vs/workbench/contrib/chat/common/chatModel';
 import { IChatProgress, IChatFollowup } from 'vs/workbench/contrib/chat/common/chatService';
 import { IVoiceChatSessionOptions, IVoiceChatTextEvent, VoiceChatService } from 'vs/workbench/contrib/chat/common/voiceChat';
-import { ISpeechProvider, ISpeechService, ISpeechToTextEvent, ISpeechToTextSession, KeywordRecognitionStatus, SpeechToTextStatus } from 'vs/workbench/contrib/speech/common/speechService';
+import { ISpeechProvider, ISpeechService, ISpeechToTextEvent, ISpeechToTextSession, ITextToSpeechSession, KeywordRecognitionStatus, SpeechToTextStatus } from 'vs/workbench/contrib/speech/common/speechService';
 import { nullExtensionDescription } from 'vs/workbench/services/extensions/common/extensions';
 
 suite('VoiceChat', () => {
@@ -75,6 +75,7 @@ suite('VoiceChat', () => {
 
 		readonly hasSpeechProvider = true;
 		readonly hasActiveSpeechToTextSession = false;
+		readonly hasActiveTextToSpeechSession = false;
 		readonly hasActiveKeywordRecognition = false;
 
 		registerSpeechProvider(identifier: string, provider: ISpeechProvider): IDisposable { throw new Error('Method not implemented.'); }
@@ -87,6 +88,16 @@ suite('VoiceChat', () => {
 			};
 		}
 
+		onDidStartTextToSpeechSession = Event.None;
+		onDidEndTextToSpeechSession = Event.None;
+
+		async createTextToSpeechSession(token: CancellationToken): Promise<ITextToSpeechSession> {
+			return {
+				onDidChange: Event.None,
+				synthesize: async () => { }
+			};
+		}
+
 		onDidStartKeywordRecognition = Event.None;
 		onDidEndKeywordRecognition = Event.None;
 		recognizeKeyword(token: CancellationToken): Promise { throw new Error('Method not implemented.'); }
diff --git a/src/vs/workbench/contrib/speech/browser/speechService.ts b/src/vs/workbench/contrib/speech/browser/speechService.ts
index 94a45671ea3..25d5c0ce951 100644
--- a/src/vs/workbench/contrib/speech/browser/speechService.ts
+++ b/src/vs/workbench/contrib/speech/browser/speechService.ts
@@ -12,7 +12,7 @@ import { IContextKeyService } from 'vs/platform/contextkey/common/contextkey';
 import { ILogService } from 'vs/platform/log/common/log';
 import { IHostService } from 'vs/workbench/services/host/browser/host';
 import { DeferredPromise } from 'vs/base/common/async';
-import { ISpeechService, ISpeechProvider, HasSpeechProvider, ISpeechToTextSession, SpeechToTextInProgress, IKeywordRecognitionSession, KeywordRecognitionStatus, SpeechToTextStatus, speechLanguageConfigToLanguage, SPEECH_LANGUAGE_CONFIG } from 'vs/workbench/contrib/speech/common/speechService';
+import { ISpeechService, ISpeechProvider, HasSpeechProvider, ISpeechToTextSession, SpeechToTextInProgress, IKeywordRecognitionSession, KeywordRecognitionStatus, SpeechToTextStatus, speechLanguageConfigToLanguage, SPEECH_LANGUAGE_CONFIG, ITextToSpeechSession, TextToSpeechInProgress, TextToSpeechStatus } from 'vs/workbench/contrib/speech/common/speechService';
 import { ITelemetryService } from 'vs/platform/telemetry/common/telemetry';
 import { IConfigurationService } from 'vs/platform/configuration/common/configuration';
 import { ExtensionsRegistry } from 'vs/workbench/services/extensions/common/extensionsRegistry';
@@ -126,6 +126,8 @@ export class SpeechService extends Disposable implements ISpeechService {
 		this._onDidChangeHasSpeechProvider.fire();
 	}
 
+	//#region Transcription
+
 	private readonly _onDidStartSpeechToTextSession = this._register(new Emitter());
 	readonly onDidStartSpeechToTextSession = this._onDidStartSpeechToTextSession.event;
 
@@ -236,6 +238,89 @@ export class SpeechService extends Disposable implements ISpeechService {
 		return provider;
 	}
 
+	//#endregion
+
+	//#region Synthesizer
+
+	private readonly _onDidStartTextToSpeechSession = this._register(new Emitter<void>());
+	readonly onDidStartTextToSpeechSession = this._onDidStartTextToSpeechSession.event;
+
+	private readonly _onDidEndTextToSpeechSession = this._register(new Emitter<void>());
+	readonly onDidEndTextToSpeechSession = this._onDidEndTextToSpeechSession.event;
+
+	private _activeTextToSpeechSession: ITextToSpeechSession | undefined = undefined;
+	get hasActiveTextToSpeechSession() { return !!this._activeTextToSpeechSession; }
+
+	private readonly textToSpeechInProgress = TextToSpeechInProgress.bindTo(this.contextKeyService);
+
+	async createTextToSpeechSession(token: CancellationToken, context: string = 'speech'): Promise<ITextToSpeechSession> {
+		const provider = await this.getProvider();
+
+		const session = this._activeTextToSpeechSession = provider.createTextToSpeechSession(token);
+
+		const sessionStart = Date.now();
+		let sessionError = false;
+
+		const disposables = new DisposableStore();
+
+		const onSessionStoppedOrCanceled = () => {
+			if (session === this._activeTextToSpeechSession) { // a newer session may have replaced this one; only the active one resets state
+				this._activeTextToSpeechSession = undefined;
+				this.textToSpeechInProgress.reset();
+				this._onDidEndTextToSpeechSession.fire();
+
+				type TextToSpeechSessionClassification = {
+					owner: 'bpasero';
+					comment: 'An event that fires when a text to speech session is created';
+					context: { classification: 'SystemMetaData'; purpose: 'FeatureInsight'; comment: 'Context of the session.' };
+					sessionDuration: { classification: 'SystemMetaData'; purpose: 'FeatureInsight'; comment: 'Duration of the session.' };
+					sessionError: { classification: 'SystemMetaData'; purpose: 'FeatureInsight'; comment: 'If speech resulted in error.' };
+				};
+				type TextToSpeechSessionEvent = {
+					context: string;
+					sessionDuration: number;
+					sessionError: boolean;
+				};
+				this.telemetryService.publicLog2<TextToSpeechSessionEvent, TextToSpeechSessionClassification>('textToSpeechSession', {
+					context,
+					sessionDuration: Date.now() - sessionStart,
+					sessionError
+				});
+			}
+
+			disposables.dispose();
+		};
+
+		disposables.add(token.onCancellationRequested(() => onSessionStoppedOrCanceled()));
+		if (token.isCancellationRequested) { // token was cancelled before the listener above was attached
+			onSessionStoppedOrCanceled();
+		}
+
+		disposables.add(session.onDidChange(e => {
+			switch (e.status) {
+				case TextToSpeechStatus.Started:
+					if (session === this._activeTextToSpeechSession) {
+						this.textToSpeechInProgress.set(true);
+						this._onDidStartTextToSpeechSession.fire();
+					}
+					break;
+				case TextToSpeechStatus.Stopped:
+					onSessionStoppedOrCanceled();
+					break;
+				case TextToSpeechStatus.Error:
+					this.logService.error(`Speech provider error in text to speech session: ${e.text}`);
+					sessionError = true; // only recorded for telemetry; the session ends on Stopped or cancellation
+					break;
+			}
+		}));
+
+		return session;
+	}
+
+	//#endregion
+
+	//#region Keyword Recognition
+
 	private readonly _onDidStartKeywordRecognition = this._register(new Emitter());
 	readonly onDidStartKeywordRecognition = this._onDidStartKeywordRecognition.event;
 
@@ -344,4 +429,6 @@ export class SpeechService extends Disposable implements ISpeechService {
 			onSessionStoppedOrCanceled();
 		}
 	}
+
+	//#endregion
 }
diff --git a/src/vs/workbench/contrib/speech/common/speechService.ts b/src/vs/workbench/contrib/speech/common/speechService.ts
index 4f260469983..4bd76b641aa 100644
--- a/src/vs/workbench/contrib/speech/common/speechService.ts
+++ b/src/vs/workbench/contrib/speech/common/speechService.ts
@@ -16,6 +16,7 @@ export const ISpeechService = createDecorator('speechService');
 
 export const HasSpeechProvider = new RawContextKey('hasSpeechProvider', false, { type: 'string', description: localize('hasSpeechProvider', "A speech provider is registered to the speech service.") });
 export const SpeechToTextInProgress = new RawContextKey('speechToTextInProgress', false, { type: 'string', description: localize('speechToTextInProgress', "A speech-to-text session is in progress.") });
+export const TextToSpeechInProgress = new RawContextKey<boolean>('textToSpeechInProgress', false, { type: 'boolean', description: localize('textToSpeechInProgress', "A text-to-speech session is in progress.") });
 
 export interface ISpeechProviderMetadata {
 	readonly extension: ExtensionIdentifier;
@@ -39,6 +40,23 @@ export interface ISpeechToTextSession {
 	readonly onDidChange: Event;
 }
 
+export enum TextToSpeechStatus {
+	Started = 1,
+	Stopped = 2,
+	Error = 3
+}
+
+export interface ITextToSpeechEvent {
+	readonly status: TextToSpeechStatus;
+	readonly text?: string;
+}
+
+export interface ITextToSpeechSession {
+	readonly onDidChange: Event<ITextToSpeechEvent>;
+
+	synthesize(text: string): void;
+}
+
 export enum KeywordRecognitionStatus {
 	Recognized = 1,
 	Stopped = 2,
@@ -62,6 +80,7 @@ export interface ISpeechProvider {
 	readonly metadata: ISpeechProviderMetadata;
 
 	createSpeechToTextSession(token: CancellationToken, options?: ISpeechToTextSessionOptions): ISpeechToTextSession;
+	createTextToSpeechSession(token: CancellationToken): ITextToSpeechSession;
 	createKeywordRecognitionSession(token: CancellationToken): IKeywordRecognitionSession;
 }
 
@@ -86,6 +105,18 @@ export interface ISpeechService {
 	 */
 	createSpeechToTextSession(token: CancellationToken, context?: string): Promise;
 
+	readonly onDidStartTextToSpeechSession: Event<void>;
+	readonly onDidEndTextToSpeechSession: Event<void>;
+
+	readonly hasActiveTextToSpeechSession: boolean;
+
+	/**
+	 * Creates a synthesizer to synthesize speech from text. The returned
+	 * session object provides a method to synthesize text and listen for
+	 * events.
+	 */
+	createTextToSpeechSession(token: CancellationToken, context?: string): Promise<ITextToSpeechSession>;
+
 	readonly onDidStartKeywordRecognition: Event;
 	readonly onDidEndKeywordRecognition: Event;
 
diff --git a/src/vscode-dts/vscode.proposed.speech.d.ts b/src/vscode-dts/vscode.proposed.speech.d.ts
index 9ece68528b9..4e0ad0031ce 100644
--- a/src/vscode-dts/vscode.proposed.speech.d.ts
+++ b/src/vscode-dts/vscode.proposed.speech.d.ts
@@ -5,8 +5,6 @@
 
 declare module 'vscode' {
 
-	// todo@bpasero work in progress speech API
-
 	export interface SpeechToTextOptions {
 		readonly language?: string;
 	}
@@ -28,6 +26,23 @@ declare module 'vscode' {
 		readonly onDidChange: Event;
 	}
 
+	export enum TextToSpeechStatus {
+		Started = 1,
+		Stopped = 2,
+		Error = 3
+	}
+
+	export interface TextToSpeechEvent {
+		readonly status: TextToSpeechStatus;
+		readonly text?: string;
+	}
+
+	export interface TextToSpeechSession extends Disposable {
+		readonly onDidChange: Event<TextToSpeechEvent>;
+
+		synthesize(text: string): void;
+	}
+
 	export enum KeywordRecognitionStatus {
 		Recognized = 1,
 		Stopped = 2
@@ -44,6 +59,7 @@ declare module 'vscode' {
 
 	export interface SpeechProvider {
 		provideSpeechToTextSession(token: CancellationToken, options?: SpeechToTextOptions): SpeechToTextSession;
+		provideTextToSpeechSession(token: CancellationToken): TextToSpeechSession;
 		provideKeywordRecognitionSession(token: CancellationToken): KeywordRecognitionSession;
 	}
 