voice - allow for text to speech (#211006)

This commit is contained in:
Benjamin Pasero 2024-04-26 09:33:23 +02:00 committed by GitHub
parent f86d0f7324
commit c1f470ad5a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 248 additions and 6 deletions

View File

@ -7,13 +7,18 @@ import { Emitter, Event } from 'vs/base/common/event';
import { DisposableStore, IDisposable } from 'vs/base/common/lifecycle';
import { ILogService } from 'vs/platform/log/common/log';
import { ExtHostContext, ExtHostSpeechShape, MainContext, MainThreadSpeechShape } from 'vs/workbench/api/common/extHost.protocol';
import { IKeywordRecognitionEvent, ISpeechProviderMetadata, ISpeechService, ISpeechToTextEvent } from 'vs/workbench/contrib/speech/common/speechService';
import { IKeywordRecognitionEvent, ISpeechProviderMetadata, ISpeechService, ISpeechToTextEvent, ITextToSpeechEvent } from 'vs/workbench/contrib/speech/common/speechService';
import { IExtHostContext, extHostNamedCustomer } from 'vs/workbench/services/extensions/common/extHostCustomers';
// State kept per speech-to-text session in the `speechToTextSessions` map.
type SpeechToTextSession = {
// Emitter on which speech-to-text events for this session are fired.
readonly onDidChange: Emitter<ISpeechToTextEvent>;
};
// State kept per text-to-speech session in the `textToSpeechSessions` map.
type TextToSpeechSession = {
// Emitter fired by `$emitTextToSpeechEvent` with the events of this session.
readonly onDidChange: Emitter<ITextToSpeechEvent>;
// Asks for `text` to be synthesized within this session.
synthesize(text: string): Promise<void>;
};
// State kept per keyword recognition session in the `keywordRecognitionSessions` map.
type KeywordRecognitionSession = {
// Emitter fired by `$emitKeywordRecognitionEvent` with the events of this session.
readonly onDidChange: Emitter<IKeywordRecognitionEvent>;
};
@ -26,6 +31,7 @@ export class MainThreadSpeech implements MainThreadSpeechShape {
private readonly providerRegistrations = new Map<number, IDisposable>();
private readonly speechToTextSessions = new Map<number, SpeechToTextSession>();
private readonly textToSpeechSessions = new Map<number, TextToSpeechSession>();
private readonly keywordRecognitionSessions = new Map<number, KeywordRecognitionSession>();
constructor(
@ -66,6 +72,36 @@ export class MainThreadSpeech implements MainThreadSpeechShape {
onDidChange: onDidChange.event
};
},
// Creates a text-to-speech session that is backed by the extension host:
// events arrive through `$emitTextToSpeechEvent` and synthesis requests are
// sent through `$synthesizeSpeech`.
createTextToSpeechSession: (token) => {

	// Already cancelled: hand back an inert session without contacting the extension host
	if (token.isCancellationRequested) {
		return {
			onDidChange: Event.None,
			synthesize: async () => { }
		};
	}

	const store = new DisposableStore();
	const sessionId = Math.random();

	this.proxy.$createTextToSpeechSession(handle, sessionId);

	// Register the session so that incoming events can be routed to its emitter
	const emitter = store.add(new Emitter<ITextToSpeechEvent>());
	const synthesize = (text: string) => this.proxy.$synthesizeSpeech(sessionId, text);
	this.textToSpeechSessions.set(sessionId, { onDidChange: emitter, synthesize });

	// On cancellation: notify the extension host, forget the session and release the emitter
	const onCancel = () => {
		this.proxy.$cancelTextToSpeechSession(sessionId);
		this.textToSpeechSessions.delete(sessionId);
		store.dispose();
	};
	store.add(token.onCancellationRequested(onCancel));

	return { onDidChange: emitter.event, synthesize };
},
createKeywordRecognitionSession: token => {
if (token.isCancellationRequested) {
return {
@ -112,6 +148,11 @@ export class MainThreadSpeech implements MainThreadSpeechShape {
providerSession?.onDidChange.fire(event);
}
/**
 * Fires `event` on the emitter of the text-to-speech session registered
 * under `session`. Does nothing when no such session is registered.
 */
$emitTextToSpeechEvent(session: number, event: ITextToSpeechEvent): void {
	this.textToSpeechSessions.get(session)?.onDidChange.fire(event);
}
$emitKeywordRecognitionEvent(session: number, event: IKeywordRecognitionEvent): void {
const providerSession = this.keywordRecognitionSessions.get(session);
providerSession?.onDidChange.fire(event);
@ -124,6 +165,9 @@ export class MainThreadSpeech implements MainThreadSpeechShape {
this.speechToTextSessions.forEach(session => session.onDidChange.dispose());
this.speechToTextSessions.clear();
this.textToSpeechSessions.forEach(session => session.onDidChange.dispose());
this.textToSpeechSessions.clear();
this.keywordRecognitionSessions.forEach(session => session.onDidChange.dispose());
this.keywordRecognitionSessions.clear();
}

View File

@ -1698,6 +1698,7 @@ export function createApiFactoryAndRegisterActors(accessor: ServicesAccessor): I
DebugThread: extHostTypes.DebugThread,
RelatedInformationType: extHostTypes.RelatedInformationType,
SpeechToTextStatus: extHostTypes.SpeechToTextStatus,
TextToSpeechStatus: extHostTypes.TextToSpeechStatus,
PartialAcceptTriggerKind: extHostTypes.PartialAcceptTriggerKind,
KeywordRecognitionStatus: extHostTypes.KeywordRecognitionStatus,
ChatResponseMarkdownPart: extHostTypes.ChatResponseMarkdownPart,

View File

@ -64,7 +64,7 @@ import { ICellRange } from 'vs/workbench/contrib/notebook/common/notebookRange';
import { InputValidationType } from 'vs/workbench/contrib/scm/common/scm';
import { IWorkspaceSymbol, NotebookPriorityInfo } from 'vs/workbench/contrib/search/common/search';
import { IRawClosedNotebookFileMatch } from 'vs/workbench/contrib/search/common/searchNotebookHelpers';
import { IKeywordRecognitionEvent, ISpeechProviderMetadata, ISpeechToTextEvent } from 'vs/workbench/contrib/speech/common/speechService';
import { IKeywordRecognitionEvent, ISpeechProviderMetadata, ISpeechToTextEvent, ITextToSpeechEvent } from 'vs/workbench/contrib/speech/common/speechService';
import { CoverageDetails, ExtensionRunTestsRequest, ICallProfileRunHandler, IFileCoverage, ISerializedTestResults, IStartControllerTests, ITestItem, ITestMessage, ITestRunProfile, ITestRunTask, ResolvedTestRunRequest, TestResultState, TestsDiffOp } from 'vs/workbench/contrib/testing/common/testTypes';
import { Timeline, TimelineChangeEvent, TimelineOptions, TimelineProviderDescriptor } from 'vs/workbench/contrib/timeline/common/timeline';
import { TypeHierarchyItem } from 'vs/workbench/contrib/typeHierarchy/common/typeHierarchy';
@ -1181,6 +1181,7 @@ export interface MainThreadSpeechShape extends IDisposable {
$unregisterProvider(handle: number): void;
$emitSpeechToTextEvent(session: number, event: ISpeechToTextEvent): void;
$emitTextToSpeechEvent(session: number, event: ITextToSpeechEvent): void;
$emitKeywordRecognitionEvent(session: number, event: IKeywordRecognitionEvent): void;
}
@ -1188,6 +1189,10 @@ export interface ExtHostSpeechShape {
$createSpeechToTextSession(handle: number, session: number, language?: string): Promise<void>;
$cancelSpeechToTextSession(session: number): Promise<void>;
$createTextToSpeechSession(handle: number, session: number): Promise<void>;
$synthesizeSpeech(session: number, text: string): Promise<void>;
$cancelTextToSpeechSession(session: number): Promise<void>;
$createKeywordRecognitionSession(handle: number, session: number): Promise<void>;
$cancelKeywordRecognitionSession(session: number): Promise<void>;
}

View File

@ -17,6 +17,7 @@ export class ExtHostSpeech implements ExtHostSpeechShape {
private readonly providers = new Map<number, vscode.SpeechProvider>();
private readonly sessions = new Map<number, CancellationTokenSource>();
private readonly synthesizers = new Map<number, vscode.TextToSpeechSession>();
constructor(
mainContext: IMainContext
@ -52,6 +53,46 @@ export class ExtHostSpeech implements ExtHostSpeechShape {
this.sessions.delete(session);
}
/**
 * Creates a text-to-speech session with the provider registered under
 * `handle` and forwards its events to the main thread via
 * `$emitTextToSpeechEvent`. Does nothing when no provider is registered
 * under `handle`.
 *
 * The session's cancellation source is stored under `session` in
 * `sessions` and the provider session in `synthesizers`. Cancelling the
 * source disposes the provider session and all listeners created here.
 */
async $createTextToSpeechSession(handle: number, session: number): Promise<void> {
	const speechProvider = this.providers.get(handle);
	if (!speechProvider) {
		return;
	}

	const store = new DisposableStore();

	const source = new CancellationTokenSource();
	this.sessions.set(session, source);

	const synthesizer = store.add(speechProvider.provideTextToSpeechSession(source.token));
	this.synthesizers.set(session, synthesizer);

	store.add(synthesizer.onDidChange(event => {
		// Events of a cancelled session are dropped
		if (!source.token.isCancellationRequested) {
			this.proxy.$emitTextToSpeechEvent(session, event);
		}
	}));

	store.add(source.token.onCancellationRequested(() => store.dispose()));
}
/**
 * Passes `text` to the `synthesize` method of the provider session stored
 * under `session`. Does nothing when no such session is known.
 */
async $synthesizeSpeech(session: number, text: string): Promise<void> {
	this.synthesizers.get(session)?.synthesize(text);
}
/**
 * Disposes the cancellation source stored under `session` (if any) and
 * removes the session from both `sessions` and `synthesizers`.
 */
async $cancelTextToSpeechSession(session: number): Promise<void> {
	const source = this.sessions.get(session);
	// NOTE(review): `true` is expected to request cancellation before disposing - confirm against CancellationTokenSource
	source?.dispose(true);

	this.sessions.delete(session);
	this.synthesizers.delete(session);
}
async $createKeywordRecognitionSession(handle: number, session: number): Promise<void> {
const provider = this.providers.get(handle);
if (!provider) {

View File

@ -4484,6 +4484,12 @@ export enum SpeechToTextStatus {
Error = 5
}
/**
 * Status values carried by text-to-speech events.
 *
 * The numeric values are the same as those of the `TextToSpeechStatus`
 * enums declared in the speech service and in the proposed `vscode` API.
 */
export enum TextToSpeechStatus {
Started = 1,
Stopped = 2,
Error = 3
}
export enum KeywordRecognitionStatus {
Recognized = 1,
Stopped = 2

View File

@ -15,7 +15,7 @@ import { ChatAgentLocation, IChatAgent, IChatAgentCommand, IChatAgentData, IChat
import { IChatModel } from 'vs/workbench/contrib/chat/common/chatModel';
import { IChatProgress, IChatFollowup } from 'vs/workbench/contrib/chat/common/chatService';
import { IVoiceChatSessionOptions, IVoiceChatTextEvent, VoiceChatService } from 'vs/workbench/contrib/chat/common/voiceChat';
import { ISpeechProvider, ISpeechService, ISpeechToTextEvent, ISpeechToTextSession, KeywordRecognitionStatus, SpeechToTextStatus } from 'vs/workbench/contrib/speech/common/speechService';
import { ISpeechProvider, ISpeechService, ISpeechToTextEvent, ISpeechToTextSession, ITextToSpeechSession, KeywordRecognitionStatus, SpeechToTextStatus } from 'vs/workbench/contrib/speech/common/speechService';
import { nullExtensionDescription } from 'vs/workbench/services/extensions/common/extensions';
suite('VoiceChat', () => {
@ -75,6 +75,7 @@ suite('VoiceChat', () => {
readonly hasSpeechProvider = true;
readonly hasActiveSpeechToTextSession = false;
readonly hasActiveTextToSpeechSession = false;
readonly hasActiveKeywordRecognition = false;
// Stub: always throws 'Method not implemented.'
registerSpeechProvider(identifier: string, provider: ISpeechProvider): IDisposable { throw new Error('Method not implemented.'); }
@ -87,6 +88,16 @@ suite('VoiceChat', () => {
};
}
onDidStartTextToSpeechSession = Event.None;
onDidEndTextToSpeechSession = Event.None;
// Stub: resolves to a session that uses `Event.None` for its events and
// whose `synthesize` does nothing. The `token` is ignored.
async createTextToSpeechSession(token: CancellationToken): Promise<ITextToSpeechSession> {
	const inertSession: ITextToSpeechSession = {
		onDidChange: Event.None,
		synthesize: async () => { }
	};

	return inertSession;
}
onDidStartKeywordRecognition = Event.None;
onDidEndKeywordRecognition = Event.None;
// Stub: always throws 'Method not implemented.'
recognizeKeyword(token: CancellationToken): Promise<KeywordRecognitionStatus> { throw new Error('Method not implemented.'); }

View File

@ -12,7 +12,7 @@ import { IContextKeyService } from 'vs/platform/contextkey/common/contextkey';
import { ILogService } from 'vs/platform/log/common/log';
import { IHostService } from 'vs/workbench/services/host/browser/host';
import { DeferredPromise } from 'vs/base/common/async';
import { ISpeechService, ISpeechProvider, HasSpeechProvider, ISpeechToTextSession, SpeechToTextInProgress, IKeywordRecognitionSession, KeywordRecognitionStatus, SpeechToTextStatus, speechLanguageConfigToLanguage, SPEECH_LANGUAGE_CONFIG } from 'vs/workbench/contrib/speech/common/speechService';
import { ISpeechService, ISpeechProvider, HasSpeechProvider, ISpeechToTextSession, SpeechToTextInProgress, IKeywordRecognitionSession, KeywordRecognitionStatus, SpeechToTextStatus, speechLanguageConfigToLanguage, SPEECH_LANGUAGE_CONFIG, ITextToSpeechSession, TextToSpeechInProgress, TextToSpeechStatus } from 'vs/workbench/contrib/speech/common/speechService';
import { ITelemetryService } from 'vs/platform/telemetry/common/telemetry';
import { IConfigurationService } from 'vs/platform/configuration/common/configuration';
import { ExtensionsRegistry } from 'vs/workbench/services/extensions/common/extensionsRegistry';
@ -126,6 +126,8 @@ export class SpeechService extends Disposable implements ISpeechService {
this._onDidChangeHasSpeechProvider.fire();
}
//#region Transcription
private readonly _onDidStartSpeechToTextSession = this._register(new Emitter<void>());
readonly onDidStartSpeechToTextSession = this._onDidStartSpeechToTextSession.event;
@ -236,6 +238,89 @@ export class SpeechService extends Disposable implements ISpeechService {
return provider;
}
//#endregion
//#region Synthesizer
// Fired when the active text-to-speech session reports `TextToSpeechStatus.Started`.
private readonly _onDidStartTextToSpeechSession = this._register(new Emitter<void>());
readonly onDidStartTextToSpeechSession = this._onDidStartTextToSpeechSession.event;
// Fired when the active text-to-speech session stops or its token is cancelled.
private readonly _onDidEndTextToSpeechSession = this._register(new Emitter<void>());
readonly onDidEndTextToSpeechSession = this._onDidEndTextToSpeechSession.event;
// The session most recently created via `createTextToSpeechSession`, until it stops or is cancelled.
private _activeTextToSpeechSession: ITextToSpeechSession | undefined = undefined;
get hasActiveTextToSpeechSession() { return !!this._activeTextToSpeechSession; }
// Context key bound to `TextToSpeechInProgress`; set on `Started`, reset when the active session ends.
private readonly textToSpeechInProgress = TextToSpeechInProgress.bindTo(this.contextKeyService);
/**
 * Creates a text-to-speech session with the provider returned by
 * `getProvider()` and tracks it as the active text-to-speech session.
 *
 * While the session is the active one, a `Started` event sets the
 * `textToSpeechInProgress` context key and fires
 * `onDidStartTextToSpeechSession`. A `Stopped` event or cancellation of
 * `token` clears the active session, resets the context key, fires
 * `onDidEndTextToSpeechSession` and publishes a `textToSpeechSession`
 * telemetry event. An `Error` event is logged and recorded for telemetry.
 *
 * @param token cancellation token; cancelling it ends the session.
 * @param context label reported in telemetry, `'speech'` by default.
 * @returns the session created by the provider.
 */
async createTextToSpeechSession(token: CancellationToken, context: string = 'speech'): Promise<ITextToSpeechSession> {
	const provider = await this.getProvider();

	const session = this._activeTextToSpeechSession = provider.createTextToSpeechSession(token);

	const sessionStart = Date.now();
	let sessionError = false;

	const disposables = new DisposableStore();

	const onSessionStoppedOrCanceled = () => {
		// Only the session that is still the active one may reset shared
		// state and report telemetry; a newer session may have replaced it.
		if (session === this._activeTextToSpeechSession) {
			this._activeTextToSpeechSession = undefined;
			this.textToSpeechInProgress.reset();
			this._onDidEndTextToSpeechSession.fire();

			type TextToSpeechSessionClassification = {
				owner: 'bpasero';
				comment: 'An event that fires when a text to speech session is created';
				context: { classification: 'SystemMetaData'; purpose: 'FeatureInsight'; comment: 'Context of the session.' };
				sessionDuration: { classification: 'SystemMetaData'; purpose: 'FeatureInsight'; comment: 'Duration of the session.' };
				sessionError: { classification: 'SystemMetaData'; purpose: 'FeatureInsight'; comment: 'If speech resulted in error.' };
			};
			type TextToSpeechSessionEvent = {
				context: string;
				sessionDuration: number;
				sessionError: boolean;
			};
			this.telemetryService.publicLog2<TextToSpeechSessionEvent, TextToSpeechSessionClassification>('textToSpeechSession', {
				context,
				sessionDuration: Date.now() - sessionStart,
				sessionError
			});
		}

		disposables.dispose();
	};

	// Subscribe before the cancellation check below: that check may dispose
	// `disposables`, and a listener added to the store afterwards would
	// never be disposed.
	disposables.add(session.onDidChange(e => {
		switch (e.status) {
			case TextToSpeechStatus.Started:
				if (session === this._activeTextToSpeechSession) {
					this.textToSpeechInProgress.set(true);
					this._onDidStartTextToSpeechSession.fire();
				}
				break;
			case TextToSpeechStatus.Stopped:
				onSessionStoppedOrCanceled();
				break;
			case TextToSpeechStatus.Error:
				this.logService.error(`Speech provider error in text to speech session: ${e.text}`);
				sessionError = true;
				break;
		}
	}));

	disposables.add(token.onCancellationRequested(() => onSessionStoppedOrCanceled()));
	if (token.isCancellationRequested) {
		onSessionStoppedOrCanceled();
	}

	return session;
}
//#endregion
//#region Keyword Recognition
private readonly _onDidStartKeywordRecognition = this._register(new Emitter<void>());
readonly onDidStartKeywordRecognition = this._onDidStartKeywordRecognition.event;
@ -344,4 +429,6 @@ export class SpeechService extends Disposable implements ISpeechService {
onSessionStoppedOrCanceled();
}
}
//#endregion
}

View File

@ -16,6 +16,7 @@ export const ISpeechService = createDecorator<ISpeechService>('speechService');
// Context keys of the speech service. All three hold boolean values (default `false`),
// so their metadata declares the type as 'boolean'.
export const HasSpeechProvider = new RawContextKey<boolean>('hasSpeechProvider', false, { type: 'boolean', description: localize('hasSpeechProvider', "A speech provider is registered to the speech service.") });
export const SpeechToTextInProgress = new RawContextKey<boolean>('speechToTextInProgress', false, { type: 'boolean', description: localize('speechToTextInProgress', "A speech-to-text session is in progress.") });
export const TextToSpeechInProgress = new RawContextKey<boolean>('textToSpeechInProgress', false, { type: 'boolean', description: localize('textToSpeechInProgress', "A text-to-speech session is in progress.") });
export interface ISpeechProviderMetadata {
readonly extension: ExtensionIdentifier;
@ -39,6 +40,23 @@ export interface ISpeechToTextSession {
readonly onDidChange: Event<ISpeechToTextEvent>;
}
/**
 * Status reported by the events of a text-to-speech session.
 *
 * The numeric values are the same as those of the `TextToSpeechStatus`
 * enums declared in the extension host types and the proposed `vscode` API.
 */
export enum TextToSpeechStatus {
Started = 1,
Stopped = 2,
Error = 3
}
/**
 * An event fired by a text-to-speech session.
 */
export interface ITextToSpeechEvent {
// What happened in the session.
readonly status: TextToSpeechStatus;
// Optional text accompanying the event; the speech service logs it for `Error` events.
readonly text?: string;
}
/**
 * A session that synthesizes speech from text and reports its status via events.
 */
export interface ITextToSpeechSession {
// Fires with the status changes of this session.
readonly onDidChange: Event<ITextToSpeechEvent>;
// Requests that `text` is synthesized; no result is returned to the caller.
synthesize(text: string): void;
}
export enum KeywordRecognitionStatus {
Recognized = 1,
Stopped = 2,
@ -62,6 +80,7 @@ export interface ISpeechProvider {
readonly metadata: ISpeechProviderMetadata;
createSpeechToTextSession(token: CancellationToken, options?: ISpeechToTextSessionOptions): ISpeechToTextSession;
createTextToSpeechSession(token: CancellationToken): ITextToSpeechSession;
createKeywordRecognitionSession(token: CancellationToken): IKeywordRecognitionSession;
}
@ -86,6 +105,18 @@ export interface ISpeechService {
*/
createSpeechToTextSession(token: CancellationToken, context?: string): Promise<ISpeechToTextSession>;
readonly onDidStartTextToSpeechSession: Event<void>;
readonly onDidEndTextToSpeechSession: Event<void>;
readonly hasActiveTextToSpeechSession: boolean;
/**
* Creates a synthesizer to synthesize speech from text. The returned
* session object provides a method to synthesize text and listen for
* events.
*/
createTextToSpeechSession(token: CancellationToken, context?: string): Promise<ITextToSpeechSession>;
readonly onDidStartKeywordRecognition: Event<void>;
readonly onDidEndKeywordRecognition: Event<void>;

View File

@ -5,8 +5,6 @@
declare module 'vscode' {
// todo@bpasero work in progress speech API
/**
 * Options passed to a provider when a speech-to-text session is requested.
 */
export interface SpeechToTextOptions {
/**
 * Optional language for the session.
 * NOTE(review): the expected format of this identifier is not visible here - confirm.
 */
readonly language?: string;
}
@ -28,6 +26,23 @@ declare module 'vscode' {
readonly onDidChange: Event<SpeechToTextEvent>;
}
/**
 * Status reported by the events of a text-to-speech session.
 */
export enum TextToSpeechStatus {
Started = 1,
Stopped = 2,
Error = 3
}
/**
 * An event fired by a text-to-speech session.
 */
export interface TextToSpeechEvent {
/**
 * What happened in the session.
 */
readonly status: TextToSpeechStatus;
/**
 * Optional text accompanying the event. For `Error` events the workbench
 * writes it to its log.
 */
readonly text?: string;
}
/**
 * A text-to-speech session created by a speech provider. The session is
 * disposed when the cancellation token it was created with is cancelled.
 */
export interface TextToSpeechSession extends Disposable {
/**
 * Fires with the status changes of this session.
 */
readonly onDidChange: Event<TextToSpeechEvent>;
/**
 * Requests that `text` is synthesized.
 */
synthesize(text: string): void;
}
export enum KeywordRecognitionStatus {
Recognized = 1,
Stopped = 2
@ -44,6 +59,7 @@ declare module 'vscode' {
/**
 * A provider of speech sessions. Each method receives a cancellation token
 * and returns the session object for the requested kind of session.
 */
export interface SpeechProvider {
/**
 * Provides a speech-to-text session, optionally configured via `options`.
 */
provideSpeechToTextSession(token: CancellationToken, options?: SpeechToTextOptions): SpeechToTextSession;
/**
 * Provides a text-to-speech session.
 */
provideTextToSpeechSession(token: CancellationToken): TextToSpeechSession;
/**
 * Provides a keyword recognition session.
 */
provideKeywordRecognitionSession(token: CancellationToken): KeywordRecognitionSession;
}