// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.dialogflow.v2beta1;
import "google/api/annotations.proto";
import "google/cloud/dialogflow/v2beta1/context.proto";
import "google/cloud/dialogflow/v2beta1/intent.proto";
import "google/cloud/dialogflow/v2beta1/session_entity_type.proto";
import "google/protobuf/struct.proto";
import "google/rpc/status.proto";
import "google/type/latlng.proto";
option cc_enable_arenas = true;
option csharp_namespace = "Google.Cloud.Dialogflow.V2beta1";
option go_package = "google.golang.org/genproto/googleapis/cloud/dialogflow/v2beta1;dialogflow";
option java_multiple_files = true;
option java_outer_classname = "SessionProto";
option java_package = "com.google.cloud.dialogflow.v2beta1";
option objc_class_prefix = "DF";
// Manages user sessions.
service Sessions {
// Processes a natural language query and returns structured, actionable data
// as a result. This method is not idempotent, because it may cause contexts
// and session entity types to be updated, which in turn might affect
// results of future queries.
rpc DetectIntent(DetectIntentRequest) returns (DetectIntentResponse) {
option (google.api.http) = { post: "/v2beta1/{session=projects/*/agent/sessions/*}:detectIntent" body: "*" };
}
// Processes a natural language query in audio format in a streaming fashion
// and returns structured, actionable data as a result. This method is only
// available via the gRPC API (not REST).
rpc StreamingDetectIntent(stream StreamingDetectIntentRequest) returns (stream StreamingDetectIntentResponse);
}
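// Illustrative only, not part of the API definition: a minimal sketch of
// calling `DetectIntent` with a text query, assuming the
// `google-cloud-dialogflow` Python client (v2+); the project and session IDs
// are placeholders.
//
//   from google.cloud import dialogflow_v2beta1 as dialogflow
//
//   client = dialogflow.SessionsClient()
//   # Session IDs are caller-chosen; a hashed user ID keeps them stable.
//   session = client.session_path("my-project", "my-session-id")
//   query_input = dialogflow.QueryInput(
//       text=dialogflow.TextInput(text="book a room", language_code="en-US")
//   )
//   response = client.detect_intent(
//       request={"session": session, "query_input": query_input}
//   )
//   print(response.query_result.fulfillment_text)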
// The request to detect the user's intent.
message DetectIntentRequest {
// Required. The name of the session this query is sent to. Format:
// `projects/<Project ID>/agent/sessions/<Session ID>`, or
// `projects/<Project ID>/agent/runtimes/<Runtime ID>/sessions/<Session ID>`.
// Note: Runtimes are under construction and will be available soon.
// If <Runtime ID> is not specified, we assume the default 'sandbox' runtime.
// It's up to the API caller to choose an appropriate session ID. It can be
// a random number or some type of user identifier (preferably hashed).
// The length of the session ID must not exceed 36 bytes.
string session = 1;
// Optional. The parameters of this query.
QueryParameters query_params = 2;
// Required. The input specification. It can be set to:
//
// 1. an audio config
// which instructs the speech recognizer how to process the speech audio,
//
// 2. a conversational query in the form of text, or
//
// 3. an event that specifies which intent to trigger.
QueryInput query_input = 3;
// Optional. The natural language speech audio to be processed. This field
// should be populated if and only if `query_input` is set to an input audio config.
// A single request can contain up to 1 minute of speech audio data.
bytes input_audio = 5;
}
// The message returned from the DetectIntent method.
message DetectIntentResponse {
// The unique identifier of the response. It can be used to
// locate a response in the training example set or for reporting issues.
string response_id = 1;
// The results of the conversational query or event processing.
QueryResult query_result = 2;
// Specifies the status of the webhook request. `webhook_status`
// is never populated in webhook requests.
google.rpc.Status webhook_status = 3;
}
// Represents the parameters of the conversational query.
message QueryParameters {
// Optional. The time zone of this conversational query from the
// [time zone database](https://www.iana.org/time-zones), e.g.,
// America/New_York, Europe/Paris. If not provided, the time zone specified in
// agent settings is used.
string time_zone = 1;
// Optional. The geo location of this conversational query.
google.type.LatLng geo_location = 2;
// Optional. The collection of contexts to be activated before this query is
// executed.
repeated Context contexts = 3;
// Optional. Specifies whether to delete all contexts in the current session
// before the new ones are activated.
bool reset_contexts = 4;
// Optional. The collection of session entity types to replace or extend
// developer entities with for this query only. The entity synonyms apply
// to all languages.
repeated SessionEntityType session_entity_types = 5;
// Optional. This field can be used to pass custom data into the webhook
// associated with the agent. Arbitrary JSON objects are supported.
google.protobuf.Struct payload = 6;
}
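// Illustrative only: a sketch of populating `QueryParameters`, assuming the
// `google-cloud-dialogflow` Python client; the context name and payload keys
// are hypothetical.
//
//   from google.cloud import dialogflow_v2beta1 as dialogflow
//   from google.protobuf import struct_pb2
//
//   payload = struct_pb2.Struct()
//   payload["user_tier"] = "gold"  # arbitrary JSON forwarded to the webhook
//
//   params = dialogflow.QueryParameters(
//       time_zone="America/New_York",
//       reset_contexts=False,
//       payload=payload,
//   )
//   params.contexts.append(dialogflow.Context(
//       # Session-scoped resource name; see context.proto for the format.
//       name="projects/my-project/agent/sessions/my-session-id/contexts/room",
//       lifespan_count=2,
//   ))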
// Represents the query input. It can contain one of:
//
// 1. An audio config which
// instructs the speech recognizer how to process the speech audio.
//
// 2. A conversational query in the form of text, or
//
// 3. An event that specifies which intent to trigger.
message QueryInput {
// Required. The input specification.
oneof input {
// Instructs the speech recognizer how to process the speech audio.
InputAudioConfig audio_config = 1;
// The natural language text to be processed.
TextInput text = 2;
// The event to be processed.
EventInput event = 3;
}
}
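// Illustrative only: the three mutually exclusive ways to populate the
// `input` oneof, assuming the `google-cloud-dialogflow` Python client.
//
//   from google.cloud import dialogflow_v2beta1 as dialogflow
//
//   # Exactly one member of the oneof may be set per QueryInput.
//   text_query = dialogflow.QueryInput(
//       text=dialogflow.TextInput(text="hi", language_code="en-US"))
//   audio_query = dialogflow.QueryInput(
//       audio_config=dialogflow.InputAudioConfig(
//           audio_encoding=dialogflow.AudioEncoding.AUDIO_ENCODING_LINEAR_16,
//           sample_rate_hertz=16000,
//           language_code="en-US"))
//   event_query = dialogflow.QueryInput(
//       event=dialogflow.EventInput(name="welcome_event", language_code="en-US"))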
// Represents the result of a conversational query or event processing.
message QueryResult {
// The original conversational query text:
// - If natural language text was provided as input, `query_text` contains
// a copy of the input.
// - If natural language speech audio was provided as input, `query_text`
// contains the speech recognition result. If the speech recognizer produced
// multiple alternatives, a particular one is picked.
// - If an event was provided as input, `query_text` is not set.
string query_text = 1;
// The language that was triggered during intent detection.
// See [Language Support](https://dialogflow.com/docs/reference/language)
// for a list of the currently supported language codes.
string language_code = 15;
// The speech recognition confidence between 0.0 and 1.0. A higher number
// indicates an estimated greater likelihood that the recognized words are
// correct. The default of 0.0 is a sentinel value indicating that confidence
// was not set.
//
// You should not rely on this field, as it isn't guaranteed to be accurate or
// even set. In particular, this field isn't set in webhook calls and for
// StreamingDetectIntent since the streaming endpoint has separate confidence
// estimates per portion of the audio in StreamingRecognitionResult.
float speech_recognition_confidence = 2;
// The action name from the matched intent.
string action = 3;
// The collection of extracted parameters.
google.protobuf.Struct parameters = 4;
// This field is set to:
// - `false` if the matched intent has required parameters and not all of
// the required parameter values have been collected.
// - `true` if all required parameter values have been collected, or if the
// matched intent doesn't contain any required parameters.
bool all_required_params_present = 5;
// The text to be pronounced to the user or shown on the screen.
string fulfillment_text = 6;
// The collection of rich messages to present to the user.
repeated Intent.Message fulfillment_messages = 7;
// If the query was fulfilled by a webhook call, this field is set to the
// value of the `source` field returned in the webhook response.
string webhook_source = 8;
// If the query was fulfilled by a webhook call, this field is set to the
// value of the `payload` field returned in the webhook response.
google.protobuf.Struct webhook_payload = 9;
// The collection of output contexts. If applicable,
// `output_contexts.parameters` contains entries with name
// `<parameter name>.original` containing the original parameter values
// before the query.
repeated Context output_contexts = 10;
// The intent that matched the conversational query. Some, but not all, fields
// are filled in this message, including but not limited to `name`,
// `display_name`, and `webhook_state`.
Intent intent = 11;
// The intent detection confidence. Values range from 0.0
// (completely uncertain) to 1.0 (completely certain).
float intent_detection_confidence = 12;
// The free-form diagnostic info. For example, this field
// could contain webhook call latency.
google.protobuf.Struct diagnostic_info = 14;
}
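// Illustrative only: a sketch of inspecting a `QueryResult`, assuming the
// `google-cloud-dialogflow` Python client and a `response` obtained from
// `DetectIntent` as in the earlier sketch.
//
//   result = response.query_result
//   print("query:     ", result.query_text)
//   print("intent:    ", result.intent.display_name)
//   print("confidence:", result.intent_detection_confidence)
//   print("reply:     ", result.fulfillment_text)
//   if not result.all_required_params_present:
//       # The agent is still slot-filling; fulfillment_text holds its prompt.
//       print("collected so far:", dict(result.parameters))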
// The top-level message sent by the client to the
// `StreamingDetectIntent` method.
//
// Multiple request messages should be sent in order:
//
// 1. The first message must contain `session`, `query_input` plus optionally
// `query_params` and/or `single_utterance`. The message must not contain `input_audio`.
//
// 2. If `query_input` was set to a streaming input audio config,
// all subsequent messages must contain only `input_audio`.
// Otherwise, finish the request stream.
message StreamingDetectIntentRequest {
// Required. The name of the session the query is sent to.
// Format of the session name:
// `projects/<Project ID>/agent/sessions/<Session ID>`, or
// `projects/<Project ID>/agent/runtimes/<Runtime ID>/sessions/<Session ID>`.
// Note: Runtimes are under construction and will be available soon.
// If <Runtime ID> is not specified, we assume the default 'sandbox' runtime.
// It's up to the API caller to choose an appropriate <Session ID>. It can be
// a random number or some type of user identifier (preferably hashed).
// The length of the session ID must not exceed 36 characters.
string session = 1;
// Optional. The parameters of this query.
QueryParameters query_params = 2;
// Required. The input specification. It can be set to:
//
// 1. an audio config which instructs the speech recognizer how to process
// the speech audio,
//
// 2. a conversational query in the form of text, or
//
// 3. an event that specifies which intent to trigger.
QueryInput query_input = 3;
// Optional. If `true`, the recognizer will detect a single spoken utterance
// in input audio. When it detects that the user has paused or stopped
// speaking, it will cease recognition. This setting is ignored when
// `query_input` is a piece of text or an event.
bool single_utterance = 4;
// Optional. The input audio content to be recognized. Must be sent if
// `query_input` was set to a streaming input audio config. The complete audio
// over all streaming messages must not exceed 1 minute.
bytes input_audio = 6;
}
// The top-level message returned from the
// `StreamingDetectIntent` method.
//
// Multiple response messages can be returned in order:
//
// 1. If the input was set to streaming audio, the first one or more messages
// contain `recognition_result`. Each `recognition_result` represents a more
// complete transcript of what the user said. The last `recognition_result`
// has `is_final` set to `true`.
//
// 2. The next message contains `response_id`, `query_result`
// and optionally `webhook_status` if a webhook was called.
message StreamingDetectIntentResponse {
// The unique identifier of the response. It can be used to
// locate a response in the training example set or for reporting issues.
string response_id = 1;
// The result of speech recognition.
StreamingRecognitionResult recognition_result = 2;
// The result of the conversational query or event processing.
QueryResult query_result = 3;
// Specifies the status of the webhook request.
google.rpc.Status webhook_status = 4;
}
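// Illustrative only: an end-to-end streaming sketch following the message
// order described above, assuming the `google-cloud-dialogflow` Python
// client; the file path and audio parameters are placeholders.
//
//   from google.cloud import dialogflow_v2beta1 as dialogflow
//
//   def requests(session, chunks):
//       # 1. First message: configuration only; `input_audio` must be absent.
//       yield dialogflow.StreamingDetectIntentRequest(
//           session=session,
//           query_input=dialogflow.QueryInput(
//               audio_config=dialogflow.InputAudioConfig(
//                   audio_encoding=dialogflow.AudioEncoding.AUDIO_ENCODING_LINEAR_16,
//                   sample_rate_hertz=16000,
//                   language_code="en-US")),
//           single_utterance=True)
//       # 2. Subsequent messages: audio only, at most 1 minute in total.
//       for chunk in chunks:
//           yield dialogflow.StreamingDetectIntentRequest(input_audio=chunk)
//
//   def chunks(path, size=4096):
//       with open(path, "rb") as f:
//           while data := f.read(size):
//               yield data
//
//   client = dialogflow.SessionsClient()
//   session = client.session_path("my-project", "my-session-id")
//   for response in client.streaming_detect_intent(
//           requests=requests(session, chunks("utterance.raw"))):
//       if response.recognition_result.transcript:
//           print("interim:", response.recognition_result.transcript)
//       if response.query_result.query_text:
//           print("reply:", response.query_result.fulfillment_text)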
// Contains a speech recognition result corresponding to a portion of the audio
// that is currently being processed or an indication that this is the end
// of the single requested utterance.
//
// Example:
//
// 1. transcript: "tube"
//
// 2. transcript: "to be a"
//
// 3. transcript: "to be"
//
// 4. transcript: "to be or not to be"
// is_final: true
//
// 5. transcript: " that's"
//
// 6. transcript: " that is"
//
// 7. message_type: `END_OF_SINGLE_UTTERANCE`
//
// 8. transcript: " that is the question"
// is_final: true
//
// Only two of the responses contain final results (#4 and #8, indicated by
// `is_final: true`). Concatenating these generates the full transcript: "to be
// or not to be that is the question".
//
// In each response we populate:
//
// * for `message_type` = `TRANSCRIPT`: `transcript` and possibly `is_final`.
//
// * for `message_type` = `END_OF_SINGLE_UTTERANCE`: only `message_type`.
message StreamingRecognitionResult {
// Type of the response message.
enum MessageType {
// Not specified. Should never be used.
MESSAGE_TYPE_UNSPECIFIED = 0;
// Message contains a (possibly partial) transcript.
TRANSCRIPT = 1;
// Event indicates that the server has detected the end of the user's speech
// utterance and expects no additional speech. Therefore, the server will
// not process additional audio (although it may subsequently return
// additional results). The client should stop sending additional audio
// data, half-close the gRPC connection, and wait for any additional results
// until the server closes the gRPC connection. This message is only sent if
// `single_utterance` was set to `true`, and is not used otherwise.
END_OF_SINGLE_UTTERANCE = 2;
}
// Type of the result message.
MessageType message_type = 1;
// Transcript text representing the words that the user spoke.
// Populated if and only if `message_type` = `TRANSCRIPT`.
string transcript = 2;
// If `false`, the `StreamingRecognitionResult` represents an
// interim result that may change. If `true`, the recognizer will not return
// any further hypotheses about this piece of the audio. May only be populated
// for `message_type` = `TRANSCRIPT`.
bool is_final = 3;
// The speech confidence between 0.0 and 1.0 for the current portion of audio.
// A higher number indicates an estimated greater likelihood that the
// recognized words are correct. The default of 0.0 is a sentinel value
// indicating that confidence was not set.
//
// This field is typically only provided if `is_final` is true, and you should
// not rely on it being accurate or even set.
float confidence = 4;
}
// Instructs the speech recognizer how to process the audio content.
message InputAudioConfig {
// Required. Audio encoding of the audio content to process.
AudioEncoding audio_encoding = 1;
// Required. Sample rate (in Hertz) of the audio content sent in the query.
// Refer to [Cloud Speech API documentation](/speech/docs/basics) for more
// details.
int32 sample_rate_hertz = 2;
// Required. The language of the supplied audio. Dialogflow does not do
// translations. See [Language
// Support](https://dialogflow.com/docs/languages) for a list of the
// currently supported language codes. Note that queries in the same session
// do not necessarily need to specify the same language.
string language_code = 3;
// Optional. The collection of phrase hints which are used to boost accuracy
// of speech recognition.
// Refer to [Cloud Speech API documentation](/speech/docs/basics#phrase-hints)
// for more details.
repeated string phrase_hints = 4;
}
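// Illustrative only: a sketch of a non-streaming audio query, assuming the
// `google-cloud-dialogflow` Python client; the file is assumed to hold raw
// LINEAR16 samples at 16 kHz.
//
//   from google.cloud import dialogflow_v2beta1 as dialogflow
//
//   client = dialogflow.SessionsClient()
//   session = client.session_path("my-project", "my-session-id")
//   with open("utterance.raw", "rb") as f:
//       audio = f.read()  # at most 1 minute of audio
//   response = client.detect_intent(request={
//       "session": session,
//       "query_input": dialogflow.QueryInput(
//           audio_config=dialogflow.InputAudioConfig(
//               audio_encoding=dialogflow.AudioEncoding.AUDIO_ENCODING_LINEAR_16,
//               sample_rate_hertz=16000,
//               language_code="en-US")),
//       "input_audio": audio,
//   })
//   print(response.query_result.query_text)  # the speech transcript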
// Represents the natural language text to be processed.
message TextInput {
// Required. The UTF-8 encoded natural language text to be processed.
// Text length must not exceed 256 bytes.
string text = 1;
// Required. The language of this conversational query. See [Language
// Support](https://dialogflow.com/docs/languages) for a list of the
// currently supported language codes. Note that queries in the same session
// do not necessarily need to specify the same language.
string language_code = 2;
}
// Events allow for matching intents by event name instead of the natural
// language input. For instance, input `<event: { name: "welcome_event",
// parameters: { name: "Sam" } }>` can trigger a personalized welcome response.
// The parameter `name` may be used by the agent in the response:
// `"Hello #welcome_event.name! What can I do for you today?"`.
message EventInput {
// Required. The unique identifier of the event.
string name = 1;
// Optional. The collection of parameters associated with the event.
google.protobuf.Struct parameters = 2;
// Required. The language of this query. See [Language
// Support](https://dialogflow.com/docs/languages) for a list of the
// currently supported language codes. Note that queries in the same session
// do not necessarily need to specify the same language.
string language_code = 3;
}
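// Illustrative only: triggering the welcome example above via an event,
// assuming the `google-cloud-dialogflow` Python client.
//
//   from google.cloud import dialogflow_v2beta1 as dialogflow
//   from google.protobuf import struct_pb2
//
//   event_params = struct_pb2.Struct()
//   event_params["name"] = "Sam"
//
//   client = dialogflow.SessionsClient()
//   session = client.session_path("my-project", "my-session-id")
//   response = client.detect_intent(request={
//       "session": session,
//       "query_input": dialogflow.QueryInput(
//           event=dialogflow.EventInput(
//               name="welcome_event",
//               parameters=event_params,
//               language_code="en-US")),
//   })
//   print(response.query_result.fulfillment_text)  # e.g. "Hello Sam! ..."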
// Audio encoding of the audio content sent in the conversational query request.
// Refer to the [Cloud Speech API documentation](/speech/docs/basics) for more
// details.
enum AudioEncoding {
// Not specified.
AUDIO_ENCODING_UNSPECIFIED = 0;
// Uncompressed 16-bit signed little-endian samples (Linear PCM).
AUDIO_ENCODING_LINEAR_16 = 1;
// [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
// Codec) is the recommended encoding because it is lossless (therefore
// recognition is not compromised) and requires only about half the
// bandwidth of `LINEAR16`. `FLAC` stream encoding supports 16-bit and
// 24-bit samples; however, not all fields in `STREAMINFO` are supported.
AUDIO_ENCODING_FLAC = 2;
// 8-bit samples that compand 14-bit audio samples using G.711 PCMU/mu-law.
AUDIO_ENCODING_MULAW = 3;
// Adaptive Multi-Rate Narrowband codec. `sample_rate_hertz` must be 8000.
AUDIO_ENCODING_AMR = 4;
// Adaptive Multi-Rate Wideband codec. `sample_rate_hertz` must be 16000.
AUDIO_ENCODING_AMR_WB = 5;
// Opus-encoded audio frames in an Ogg container
// ([OggOpus](https://wiki.xiph.org/OggOpus)).
// `sample_rate_hertz` must be 16000.
AUDIO_ENCODING_OGG_OPUS = 6;
// Although the use of lossy encodings is not recommended, if a very low
// bitrate encoding is required, `OGG_OPUS` is highly preferred over
// Speex encoding. The [Speex](https://speex.org/) encoding supported by the
// Dialogflow API has a header byte in each block, as in MIME type
// `audio/x-speex-with-header-byte`.
// It is a variant of the RTP Speex encoding defined in
// [RFC 5574](https://tools.ietf.org/html/rfc5574).
// The stream is a sequence of blocks, one block per RTP packet. Each block
// starts with a byte containing the length of the block, in bytes, followed
// by one or more frames of Speex data, padded to an integral number of
// bytes (octets) as specified in RFC 5574. In other words, each RTP header
// is replaced with a single byte containing the block length. Only Speex
// wideband is supported. `sample_rate_hertz` must be 16000.
AUDIO_ENCODING_SPEEX_WITH_HEADER_BYTE = 7;
}