Streamed Inference 1

(“stream” = true; the response is returned in SSE format):


data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"\t"},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"\t"},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":null}]}

data: {"id":"endpoint_common_8","object":"chat.completion.chunk","created":1729614610,"model":"DeepSeek-R1","usage":{"prompt_tokens":54,"completion_tokens":17,"total_tokens":71},"choices":[{"index":0,"delta":{"role":"assistant","content":""},"finish_reason":"stop"}]}

data: [DONE]
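
For reference, the following is a minimal client sketch in Python (using the requests library) that consumes a stream like the one above and assembles the reply by concatenating the per-chunk delta content. The service address, request path, and every request-body field except “stream” are illustrative assumptions, not part of this interface description.

import json
import requests

# Hypothetical service address and OpenAI-style request body; adjust both to the
# actual deployment. Only "stream": true is taken from this section.
URL = "http://127.0.0.1:1025/v1/chat/completions"
payload = {
    "model": "DeepSeek-R1",
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
}

pieces = []
with requests.post(URL, json=payload, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith("data:"):
            continue                              # skip blank keep-alive lines
        data = raw[len("data:"):].strip()
        if data == "[DONE]":                      # end-of-stream sentinel
            break
        chunk = json.loads(data)
        pieces.append(chunk["choices"][0]["delta"].get("content", ""))

print("".join(pieces))                            # reply assembled from the deltas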

Streamed Inference 2

(“stream” = true, with the configuration item “fullTextEnabled” = true; the response is returned in SSE format):

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello!"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello! How"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello! How can"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello! How can I"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello! How can I assist"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello! How can I assist you"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello! How can I assist you today"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","choices":[{"index":0,"delta":{"role":"assistant","content":"Hello! How can I assist you today?"},"finish_reason":null}]}

data: {"id":"endpoint_common_11","object":"chat.completion.chunk","created":1730184192,"model":"DeepSeek-R1","full_text":"Hello! How can I assist you today?","usage":{"prompt_tokens":31,"completion_tokens":10,"total_tokens":41},"choices":[{"index":0,"delta":{"role":"assistant","content":"Hello! How can I assist you today?"},"finish_reason":"length"}]}

data: [DONE]
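
As the example shows, when “fullTextEnabled” is true each chunk’s delta content carries the cumulative text generated so far, and the final chunk additionally carries full_text. A client should therefore keep only the latest chunk (or read full_text from the final chunk) instead of concatenating deltas. A minimal sketch, reusing the same hypothetical address and request body as the previous example:

import json
import requests

# "fullTextEnabled" is a server-side configuration item, so the request body is
# the same as in the plain streaming case; the address remains a placeholder.
URL = "http://127.0.0.1:1025/v1/chat/completions"
payload = {
    "model": "DeepSeek-R1",
    "messages": [{"role": "user", "content": "Hello"}],
    "stream": True,
}

latest = ""
full_text = None
with requests.post(URL, json=payload, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith("data:"):
            continue
        data = raw[len("data:"):].strip()
        if data == "[DONE]":
            break
        chunk = json.loads(data)
        # Each chunk holds the cumulative text, so overwrite rather than append.
        latest = chunk["choices"][0]["delta"].get("content", latest)
        full_text = chunk.get("full_text", full_text)   # present only in the final chunk

print(full_text if full_text is not None else latest)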

Output Description

Table 1  Text Inference Result Description

| Parameter Name | Type | Description |
| --- | --- | --- |
| id | string | Request ID. |
| object | string | Return result type; currently always “chat.completion”. |
| created | integer | Inference request timestamp, accurate to the second. |
| model | string | Inference model used. |
| choices | list | List of inference results. |
|   index | integer | Choice message index; currently only 0 is allowed. |
|   message | object | Inference message. |
|     role | string | Role; currently always “assistant”. |
|     content | string | Inference text result. |
|     tool_calls | list | Model tool call output. |
|       function | dict | Function call description. |
|         arguments | string | Arguments for calling the function, as a JSON string. |
|         name | string | Name of the called function. |
|       id | string | Tool call ID for the model’s tool invocation. |
|       type | string | Tool type; currently only “function” is supported. |
|   finish_reason | string | Reason for completion; see the value descriptions below this table. |
| usage | object | Inference result statistics. |
|   prompt_tokens | int | Token length of the user’s input prompt text. |
|   completion_tokens | int | Number of tokens in the inference result. In the PD scenario, this counts the total tokens of the P and D inference results; when the maximum inference length of a request is set to maxIterTimes, the D node’s response has completion_tokens equal to maxIterTimes + 1, because it includes the first token of the P inference result. |
|   total_tokens | int | Total number of tokens for the request and the inference result. |
|   prefill_time | float | Latency of the first inference token. |
|   decode_time_arr | list | Array of per-token decoding latencies. |

Nested parameters are shown indented under their parent. finish_reason takes the following values:

  • stop:

    • The request was CANCELLED or STOPPED; this is not visible to the user, and the response is discarded.
    • An error occurred while executing the request; the response output is empty and err_msg is non-empty.
    • Input validation of the request failed; the response output is empty and err_msg is non-empty.
    • The request ended normally because the eos (end-of-sequence) token was encountered.
  • length:

    • The request ended because the maximum sequence length was reached; the response is the output of the last iteration.
    • The request ended because the maximum output length (at request or model granularity) was reached; the response is the output of the last iteration.
  • tool_calls: the model invoked a tool.
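
Table 1 describes the non-streamed response (“stream” = false or omitted). A minimal sketch of reading its fields is shown below; the address and request body are again illustrative assumptions, while the parsing uses only the field names listed in Table 1.

import json
import requests

URL = "http://127.0.0.1:1025/v1/chat/completions"       # hypothetical address
payload = {
    "model": "DeepSeek-R1",
    "messages": [{"role": "user", "content": "Hello"}],
}

result = requests.post(URL, json=payload, timeout=60).json()

choice = result["choices"][0]
message = choice["message"]
print("finish_reason:", choice["finish_reason"])         # "stop", "length", or "tool_calls"
print("content:", message.get("content"))

# When the model invoked a tool, the call details are under message.tool_calls.
for call in message.get("tool_calls") or []:
    fn = call["function"]
    args = json.loads(fn["arguments"])                    # arguments are a JSON string
    print("tool call:", call["id"], fn["name"], args)

usage = result["usage"]
print("tokens:", usage["prompt_tokens"], usage["completion_tokens"], usage["total_tokens"])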

Table 2  Streamed Inference Result Description

| Parameter Name | Type | Description |
| --- | --- | --- |
| data | object | Result returned from a single inference chunk. |
|   id | string | Request ID. |
|   object | string | Return result type; currently always “chat.completion.chunk”. |
|   created | integer | Inference request timestamp, accurate to the second. |
|   model | string | Inference model used. |
|   full_text | string | Full text result; only returned when the configuration item “fullTextEnabled” is set to true. |
|   usage | object | Inference result statistics. |
|     prompt_tokens | int | Token length of the user’s input prompt text. |
|     completion_tokens | int | Number of tokens in the inference result. In the PD scenario, this counts the total tokens of the P and D inference results; when the maximum inference length of a request is set to maxIterTimes, the D node’s response has completion_tokens equal to maxIterTimes + 1, because it includes the first token of the P inference result. |
|     total_tokens | int | Total number of tokens for the request and the inference result. |
|   choices | list | Streamed inference results. |
|     index | integer | Choice message index; currently only 0 is supported. |
|     delta | object | Incremental inference result; the delta in the last response is empty. |
|       role | string | Role; currently always “assistant”. |
|       content | string | Inference text result. |
|     finish_reason | string | Reason for completion; only returned in the last chunk. See the value descriptions below this table. |

finish_reason takes the following values:

  • stop:

    • The request was CANCELLED or STOPPED; this is not visible to the user, and the response is discarded.
    • An error occurred while executing the request; the response is empty and err_msg is non-empty.
    • Input validation of the request failed; the response is empty and err_msg is non-empty.
    • The request ended normally because the eos (end-of-sequence) token was encountered.
  • length:

    • The request ended because the maximum sequence length was reached; the response is the output of the last iteration.
    • The request ended because the maximum output length (at request or model granularity) was reached; the response is the output of the last iteration.
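
In the streamed examples above, finish_reason and usage appear only in the final chunk. The following is a small sketch of how a client might summarize that final chunk once the stream ends; the chunk dictionary is assumed to have been parsed with json.loads as in the earlier sketches.

def summarize_final_chunk(chunk: dict) -> str:
    """Summarize the last streamed chunk described in Table 2 (finish_reason non-null)."""
    choice = chunk["choices"][0]
    reason = choice["finish_reason"]
    usage = chunk.get("usage", {})

    if reason == "length":
        note = "output truncated at the maximum sequence/output length"
    elif reason == "stop":
        note = "finished normally (or was cancelled / failed, with err_msg set)"
    else:
        note = f"finished with reason {reason!r}"

    return (f"{note}; prompt_tokens={usage.get('prompt_tokens')}, "
            f"completion_tokens={usage.get('completion_tokens')}, "
            f"total_tokens={usage.get('total_tokens')}")

# With the final chunk of Streamed Inference 2, this returns:
# "output truncated at the maximum sequence/output length; prompt_tokens=31,
#  completion_tokens=10, total_tokens=41"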