refactor(api): migrate console/service_api.dataset.segment to BaseModel (#36522)

Co-authored-by: WH-2099 <wh2099@pm.me>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
This commit is contained in:
chariri
2026-05-30 22:54:01 +09:00
committed by GitHub
parent f46c03460e
commit 928f888ef5
17 changed files with 1906 additions and 648 deletions
+62 -2
View File
@@ -6,10 +6,11 @@ These helpers keep that translation centralized so models registered through
`register_schema_models` emit resolvable Swagger 2.0 references.
"""
from collections.abc import Mapping
from collections.abc import Iterable, Mapping
from enum import StrEnum
from typing import Any, Literal, NotRequired, TypedDict
from typing import Any, Literal, NotRequired, Protocol, TypedDict
from flask import request
from flask_restx import Namespace
from pydantic import BaseModel, TypeAdapter
@@ -36,6 +37,12 @@ QueryParamDoc = TypedDict(
)
class QueryArgs(Protocol):
    """Structural type for the query-argument container read by ``query_params_from_request``.

    Only the two methods that helper calls are required, so any object that
    provides them can be passed as its ``args`` parameter.
    """

    # Return the arguments as a plain ``str -> str`` dict.
    def to_dict(self, flat: bool = True) -> dict[str, str]: ...

    # Return every value supplied for ``key`` as a list of strings.
    def getlist(self, key: str) -> list[str]: ...
def _register_json_schema(namespace: Namespace, name: str, schema: dict) -> None:
"""Register a JSON schema and promote any nested Pydantic `$defs`."""
@@ -167,6 +174,58 @@ def query_params_from_model(model: type[BaseModel]) -> dict[str, QueryParamDoc]:
return params
def query_params_from_request[ModelT: BaseModel](
model: type[ModelT],
*,
list_fields: Iterable[str] = (),
args: QueryArgs | None = None,
use_defaults_for_malformed_ints: bool = False,
) -> ModelT:
"""Validate query args with Pydantic while preserving Flask query parsing behavior.
Repeated params need explicit ``getlist()`` handling because Werkzeug's
``to_dict()`` keeps only one value. For malformed scalar integers, Flask's
For endpoints migrated from ``request.args.get(..., type=int, default=...)``,
set ``use_defaults_for_malformed_ints`` to preserve Flask's fallback to
defaults for malformed optional integer params.
"""
query_args = args or request.args
params: dict[str, Any] = query_args.to_dict()
for field_name in list_fields:
params[field_name] = query_args.getlist(field_name)
if use_defaults_for_malformed_ints:
_drop_malformed_defaulted_integer_params(model, params)
return model.model_validate(params)
def _drop_malformed_defaulted_integer_params(model: type[BaseModel], params: dict[str, Any]) -> None:
    """Remove unparsable values for optional integer fields from ``params`` in place.

    A value is removed only when it is a string, its name is a non-required
    field of ``model``, the field's JSON-schema property (after
    ``_nullable_property_schema``) has type ``"integer"``, and ``int()``
    rejects it. Everything else is left for Pydantic to validate.
    """
    schema = model.model_json_schema(ref_template=DEFAULT_REF_TEMPLATE_SWAGGER_2_0)
    properties = schema.get("properties", {})
    if not isinstance(properties, Mapping):
        return

    # Snapshot the keys up front: entries are deleted from ``params`` during the scan.
    for name in tuple(params):
        raw_value = params[name]
        if not isinstance(raw_value, str):
            continue

        field_info = model.model_fields.get(name)
        has_default = field_info is not None and not field_info.is_required()
        if not has_default:
            continue

        property_schema = properties.get(name)
        if not isinstance(property_schema, Mapping):
            continue
        if _nullable_property_schema(property_schema).get("type") != "integer":
            continue

        try:
            int(raw_value)
        except ValueError:
            # Malformed integer: drop it so the model default is used instead.
            del params[name]
def _query_param_from_property(property_schema: Mapping[str, Any], *, required: bool) -> QueryParamDoc:
param_schema = _nullable_property_schema(property_schema)
param_doc: QueryParamDoc = {"in": "query", "required": required}
@@ -239,6 +298,7 @@ __all__ = [
"DEFAULT_REF_TEMPLATE_SWAGGER_2_0",
"get_or_create_model",
"query_params_from_model",
"query_params_from_request",
"register_enum_models",
"register_response_schema_model",
"register_response_schema_models",
@@ -1,9 +1,10 @@
import uuid
from typing import Literal
from typing import cast as type_cast
from uuid import UUID
from flask import request
from flask_restx import Resource, marshal
from flask_restx import Resource
from pydantic import BaseModel, Field
from sqlalchemy import String, case, cast, func, literal, or_, select
from sqlalchemy.dialects.postgresql import JSONB
@@ -13,7 +14,12 @@ import services
from configs import dify_config
from controllers.common.controller_schemas import ChildChunkCreatePayload, ChildChunkUpdatePayload
from controllers.common.fields import SimpleResultResponse
from controllers.common.schema import register_response_schema_models, register_schema_models
from controllers.common.schema import (
query_params_from_model,
query_params_from_request,
register_response_schema_models,
register_schema_models,
)
from controllers.console import console_ns
from controllers.console.app.error import ProviderNotInitializeError
from controllers.console.datasets.error import (
@@ -34,9 +40,17 @@ from core.rag.index_processor.constant.index_type import IndexTechniqueType
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.base import ResponseModel
from fields.segment_fields import child_chunk_fields, segment_fields
from fields.segment_fields import (
ChildChunkDetailResponse,
ChildChunkListResponse,
ChildChunkResponse,
SegmentDetailResponse,
SegmentResponse,
segment_response_with_summary,
segment_responses_with_summaries,
)
from graphon.model_runtime.entities.model_entities import ModelType
from libs.helper import escape_like_pattern
from libs.helper import dump_response, escape_like_pattern
from libs.login import current_account_with_tenant, login_required
from models.dataset import ChildChunk, DocumentSegment
from models.model import UploadFile
@@ -44,20 +58,10 @@ from services.dataset_service import DatasetService, DocumentService, SegmentSer
from services.entities.knowledge_entities.knowledge_entities import ChildChunkUpdateArgs, SegmentUpdateArgs
from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
from services.errors.chunk import ChildChunkIndexingError as ChildChunkIndexingServiceError
from services.summary_index_service import SummaryIndexService
from tasks.batch_create_segment_to_index_task import batch_create_segment_to_index_task
def _get_segment_with_summary(segment, dataset_id):
    """Marshal ``segment`` with ``segment_fields`` and attach its summary text under ``"summary"``."""
    from services.summary_index_service import SummaryIndexService

    marshalled = dict(marshal(segment, segment_fields))  # type: ignore
    # Look up the summary for this segment; the key is set to None when there is none.
    summary_record = SummaryIndexService.get_segment_summary(segment_id=segment.id, dataset_id=dataset_id)
    if summary_record:
        marshalled["summary"] = summary_record.summary_content
    else:
        marshalled["summary"] = None
    return marshalled
class SegmentListQuery(BaseModel):
limit: int = Field(default=20, ge=1, le=100)
status: list[str] = Field(default_factory=list)
@@ -67,6 +71,16 @@ class SegmentListQuery(BaseModel):
page: int = Field(default=1, ge=1)
# Query parameters carrying a list of segment IDs.
class SegmentIdListQuery(BaseModel):
    # Segment IDs; defaults to an empty list.
    segment_id: list[str] = Field(default_factory=list, description="Segment IDs")
# Query parameters for listing child chunks: pagination plus an optional keyword.
class ChildChunkListQuery(BaseModel):
    # Page size; defaults to 20 and must be between 1 and 100 inclusive.
    limit: int = Field(default=20, ge=1, le=100)
    # Optional keyword; None when the parameter is not supplied.
    keyword: str | None = None
    # Page number, at least 1; defaults to 1.
    page: int = Field(default=1, ge=1)
class SegmentCreatePayload(BaseModel):
content: str
answer: str | None = None
@@ -92,13 +106,35 @@ class SegmentBatchImportStatusResponse(ResponseModel):
job_status: str
# Response body for the console segment list endpoint: one page of segments plus paging info.
class ConsoleSegmentListResponse(ResponseModel):
    # Segments on the current page.
    data: list[SegmentResponse]
    # Page size applied to the query.
    limit: int
    # Total count reported by the paginated query.
    total: int
    # Number of pages reported by the paginated query.
    total_pages: int
    # Page number that was requested.
    page: int
# Response body returned after a batch update of child chunks.
class ChildChunkBatchUpdateResponse(ResponseModel):
    # Child chunks returned by the batch update.
    data: list[ChildChunkResponse]
# Request body for a batch update of child chunks.
class ChildChunkBatchUpdatePayload(BaseModel):
    # One update entry per child chunk.
    chunks: list[ChildChunkUpdateArgs]
# Shared path-parameter descriptions passed to ``console_ns.doc(params=...)``.
# Each mapping extends a smaller one so the wording stays identical across endpoints.
class SegmentDocParams:
    # Routes addressed by dataset and document.
    DATASET_DOCUMENT = {"dataset_id": "Dataset ID", "document_id": "Document ID"}
    # Routes that also take an ``action`` path segment.
    DATASET_DOCUMENT_ACTION = {**DATASET_DOCUMENT, "action": "Action"}
    # Routes that address a single segment.
    DATASET_DOCUMENT_SEGMENT = {**DATASET_DOCUMENT, "segment_id": "Segment ID"}
    # Child-chunk routes, where ``segment_id`` is described as the parent segment.
    DATASET_DOCUMENT_PARENT_SEGMENT = {**DATASET_DOCUMENT, "segment_id": "Parent segment ID"}
    # Routes that address a single child chunk under a parent segment.
    DATASET_DOCUMENT_CHILD_CHUNK = {**DATASET_DOCUMENT_PARENT_SEGMENT, "child_chunk_id": "Child chunk ID"}
register_schema_models(
console_ns,
SegmentListQuery,
SegmentIdListQuery,
ChildChunkListQuery,
SegmentCreatePayload,
SegmentUpdatePayload,
BatchImportPayload,
@@ -107,11 +143,24 @@ register_schema_models(
ChildChunkBatchUpdatePayload,
ChildChunkUpdateArgs,
)
register_response_schema_models(console_ns, SegmentBatchImportStatusResponse, SimpleResultResponse)
register_response_schema_models(
console_ns,
SegmentResponse,
ConsoleSegmentListResponse,
SegmentDetailResponse,
ChildChunkDetailResponse,
ChildChunkListResponse,
ChildChunkBatchUpdateResponse,
SegmentBatchImportStatusResponse,
SimpleResultResponse,
)
@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments")
class DatasetDocumentSegmentListApi(Resource):
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT)
@console_ns.doc(params=query_params_from_model(SegmentListQuery))
@console_ns.response(200, "Segments retrieved successfully", console_ns.models[ConsoleSegmentListResponse.__name__])
@setup_required
@login_required
@account_initialization_required
@@ -134,12 +183,7 @@ class DatasetDocumentSegmentListApi(Resource):
if not document:
raise NotFound("Document not found.")
args = SegmentListQuery.model_validate(
{
**request.args.to_dict(),
"status": request.args.getlist("status"),
}
)
args = query_params_from_request(SegmentListQuery, list_fields=("status",))
page = args.page
limit = min(args.limit, 100)
@@ -205,38 +249,30 @@ class DatasetDocumentSegmentListApi(Resource):
segments = db.paginate(select=query, page=page, per_page=limit, max_per_page=100, error_out=False)
# Query summaries for all segments in this page (batch query for efficiency)
segment_ids = [segment.id for segment in segments.items]
summaries = {}
segment_list = list(segments.items)
segment_ids = [segment.id for segment in segment_list]
summaries: dict[str, str | None] = {}
if segment_ids:
from services.summary_index_service import SummaryIndexService
summary_records = SummaryIndexService.get_segments_summaries(
segment_ids=segment_ids, dataset_id=dataset_id_str
)
# Only include enabled summaries (already filtered by service)
summaries = {chunk_id: summary.summary_content for chunk_id, summary in summary_records.items()}
# Add summary to each segment
segments_with_summary = []
for segment in segments.items:
segment_dict = dict(marshal(segment, segment_fields)) # type: ignore
segment_dict["summary"] = summaries.get(segment.id)
segments_with_summary.append(segment_dict)
response = {
"data": segments_with_summary,
"data": segment_responses_with_summaries(segment_list, summaries),
"limit": limit,
"total": segments.total,
"total_pages": segments.pages,
"page": page,
}
return response, 200
return dump_response(ConsoleSegmentListResponse, response), 200
@setup_required
@login_required
@account_initialization_required
@cloud_edition_billing_rate_limit_check("knowledge")
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT)
@console_ns.doc(params=query_params_from_model(SegmentIdListQuery))
@console_ns.response(204, "Segments deleted successfully")
def delete(self, dataset_id: UUID, document_id: UUID):
current_user, _ = current_account_with_tenant()
@@ -268,6 +304,8 @@ class DatasetDocumentSegmentListApi(Resource):
@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segment/<string:action>")
class DatasetDocumentSegmentApi(Resource):
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_ACTION)
@console_ns.doc(params=query_params_from_model(SegmentIdListQuery))
@setup_required
@login_required
@account_initialization_required
@@ -321,11 +359,12 @@ class DatasetDocumentSegmentApi(Resource):
SegmentService.update_segments_status(segment_ids, action, dataset, document)
except Exception as e:
raise InvalidActionError(str(e))
return {"result": "success"}, 200
return dump_response(SimpleResultResponse, {"result": "success"}), 200
@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segment")
class DatasetDocumentSegmentAddApi(Resource):
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT)
@setup_required
@login_required
@account_initialization_required
@@ -333,6 +372,7 @@ class DatasetDocumentSegmentAddApi(Resource):
@cloud_edition_billing_knowledge_limit_check("add_segment")
@cloud_edition_billing_rate_limit_check("knowledge")
@console_ns.expect(console_ns.models[SegmentCreatePayload.__name__])
@console_ns.response(200, "Segment created successfully", console_ns.models[SegmentDetailResponse.__name__])
def post(self, dataset_id: UUID, document_id: UUID):
current_user, current_tenant_id = current_account_with_tenant()
@@ -372,18 +412,25 @@ class DatasetDocumentSegmentAddApi(Resource):
payload = SegmentCreatePayload.model_validate(console_ns.payload or {})
payload_dict = payload.model_dump(exclude_none=True)
SegmentService.segment_create_args_validate(payload_dict, document)
segment = SegmentService.create_segment(payload_dict, document, dataset)
return {"data": _get_segment_with_summary(segment, dataset_id_str), "doc_form": document.doc_form}, 200
segment = type_cast(DocumentSegment, SegmentService.create_segment(payload_dict, document, dataset))
summary = SummaryIndexService.get_segment_summary(segment_id=segment.id, dataset_id=dataset_id_str)
response = {
"data": segment_response_with_summary(segment, summary.summary_content if summary else None),
"doc_form": document.doc_form,
}
return dump_response(SegmentDetailResponse, response), 200
@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>")
class DatasetDocumentSegmentUpdateApi(Resource):
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_SEGMENT)
@setup_required
@login_required
@account_initialization_required
@cloud_edition_billing_resource_check("vector_space")
@cloud_edition_billing_rate_limit_check("knowledge")
@console_ns.expect(console_ns.models[SegmentUpdatePayload.__name__])
@console_ns.response(200, "Segment updated successfully", console_ns.models[SegmentDetailResponse.__name__])
def patch(self, dataset_id: UUID, document_id: UUID, segment_id: UUID):
current_user, current_tenant_id = current_account_with_tenant()
@@ -440,12 +487,18 @@ class DatasetDocumentSegmentUpdateApi(Resource):
segment = SegmentService.update_segment(
SegmentUpdateArgs.model_validate(payload.model_dump(exclude_none=True)), segment, document, dataset
)
return {"data": _get_segment_with_summary(segment, dataset_id_str), "doc_form": document.doc_form}, 200
summary = SummaryIndexService.get_segment_summary(segment_id=segment.id, dataset_id=dataset_id_str)
response = {
"data": segment_response_with_summary(segment, summary.summary_content if summary else None),
"doc_form": document.doc_form,
}
return dump_response(SegmentDetailResponse, response), 200
@setup_required
@login_required
@account_initialization_required
@cloud_edition_billing_rate_limit_check("knowledge")
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_SEGMENT)
@console_ns.response(204, "Segment deleted successfully")
def delete(self, dataset_id: UUID, document_id: UUID, segment_id: UUID):
current_user, current_tenant_id = current_account_with_tenant()
@@ -523,11 +576,11 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
try:
# async job
job_id = str(uuid.uuid4())
indexing_cache_key = f"segment_batch_import_{str(job_id)}"
indexing_cache_key = f"segment_batch_import_{job_id}"
# send batch add segments task
redis_client.setnx(indexing_cache_key, "waiting")
batch_create_segment_to_index_task.delay(
str(job_id),
job_id,
upload_file_id,
dataset_id_str,
document_id_str,
@@ -536,7 +589,7 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
)
except Exception as e:
return {"error": str(e)}, 500
return {"job_id": job_id, "job_status": "waiting"}, 200
return dump_response(SegmentBatchImportStatusResponse, {"job_id": job_id, "job_status": "waiting"}), 200
@console_ns.response(200, "Batch import status", console_ns.models[SegmentBatchImportStatusResponse.__name__])
@setup_required
@@ -551,11 +604,13 @@ class DatasetDocumentSegmentBatchImportApi(Resource):
if cache_result is None:
raise ValueError("The job does not exist.")
return {"job_id": job_id, "job_status": cache_result.decode()}, 200
response = {"job_id": job_id, "job_status": cache_result.decode()}
return dump_response(SegmentBatchImportStatusResponse, response), 200
@console_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>/child_chunks")
class ChildChunkAddApi(Resource):
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_PARENT_SEGMENT)
@setup_required
@login_required
@account_initialization_required
@@ -563,6 +618,7 @@ class ChildChunkAddApi(Resource):
@cloud_edition_billing_knowledge_limit_check("add_segment")
@cloud_edition_billing_rate_limit_check("knowledge")
@console_ns.expect(console_ns.models[ChildChunkCreatePayload.__name__])
@console_ns.response(200, "Child chunk created successfully", console_ns.models[ChildChunkDetailResponse.__name__])
def post(self, dataset_id: UUID, document_id: UUID, segment_id: UUID):
current_user, current_tenant_id = current_account_with_tenant()
@@ -613,8 +669,11 @@ class ChildChunkAddApi(Resource):
child_chunk = SegmentService.create_child_chunk(payload.content, segment, document, dataset)
except ChildChunkIndexingServiceError as e:
raise ChildChunkIndexingError(str(e))
return {"data": marshal(child_chunk, child_chunk_fields)}, 200
return dump_response(ChildChunkDetailResponse, {"data": child_chunk}), 200
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_PARENT_SEGMENT)
@console_ns.doc(params=query_params_from_model(ChildChunkListQuery))
@console_ns.response(200, "Child chunks retrieved successfully", console_ns.models[ChildChunkListResponse.__name__])
@setup_required
@login_required
@account_initialization_required
@@ -642,13 +701,7 @@ class ChildChunkAddApi(Resource):
)
if not segment:
raise NotFound("Segment not found.")
args = SegmentListQuery.model_validate(
{
"limit": request.args.get("limit", default=20, type=int),
"keyword": request.args.get("keyword"),
"page": request.args.get("page", default=1, type=int),
}
)
args = query_params_from_request(ChildChunkListQuery, use_defaults_for_malformed_ints=True)
page = args.page
limit = min(args.limit, 100)
@@ -657,19 +710,27 @@ class ChildChunkAddApi(Resource):
child_chunks = SegmentService.get_child_chunks(
segment_id_str, document_id_str, dataset_id_str, page, limit, keyword
)
return {
"data": marshal(child_chunks.items, child_chunk_fields),
response = {
"data": child_chunks.items,
"total": child_chunks.total,
"total_pages": child_chunks.pages,
"page": page,
"limit": limit,
}, 200
}
return dump_response(ChildChunkListResponse, response), 200
@setup_required
@login_required
@account_initialization_required
@cloud_edition_billing_resource_check("vector_space")
@cloud_edition_billing_rate_limit_check("knowledge")
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_PARENT_SEGMENT)
@console_ns.response(
200,
"Child chunks updated successfully",
console_ns.models[ChildChunkBatchUpdateResponse.__name__],
)
@console_ns.expect(console_ns.models[ChildChunkBatchUpdatePayload.__name__])
def patch(self, dataset_id: UUID, document_id: UUID, segment_id: UUID):
current_user, current_tenant_id = current_account_with_tenant()
@@ -707,7 +768,7 @@ class ChildChunkAddApi(Resource):
child_chunks = SegmentService.update_child_chunks(payload.chunks, segment, document, dataset)
except ChildChunkIndexingServiceError as e:
raise ChildChunkIndexingError(str(e))
return {"data": marshal(child_chunks, child_chunk_fields)}, 200
return dump_response(ChildChunkBatchUpdateResponse, {"data": child_chunks}), 200
@console_ns.route(
@@ -718,6 +779,7 @@ class ChildChunkUpdateApi(Resource):
@login_required
@account_initialization_required
@cloud_edition_billing_rate_limit_check("knowledge")
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_CHILD_CHUNK)
@console_ns.response(204, "Child chunk deleted successfully")
def delete(self, dataset_id: UUID, document_id: UUID, segment_id: UUID, child_chunk_id: UUID):
current_user, current_tenant_id = current_account_with_tenant()
@@ -748,7 +810,7 @@ class ChildChunkUpdateApi(Resource):
child_chunk = db.session.scalar(
select(ChildChunk)
.where(
ChildChunk.id == str(child_chunk_id_str),
ChildChunk.id == child_chunk_id_str,
ChildChunk.tenant_id == current_tenant_id,
ChildChunk.segment_id == segment.id,
ChildChunk.document_id == document_id_str,
@@ -775,7 +837,9 @@ class ChildChunkUpdateApi(Resource):
@account_initialization_required
@cloud_edition_billing_resource_check("vector_space")
@cloud_edition_billing_rate_limit_check("knowledge")
@console_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_CHILD_CHUNK)
@console_ns.expect(console_ns.models[ChildChunkUpdatePayload.__name__])
@console_ns.response(200, "Child chunk updated successfully", console_ns.models[ChildChunkDetailResponse.__name__])
def patch(self, dataset_id: UUID, document_id: UUID, segment_id: UUID, child_chunk_id: UUID):
current_user, current_tenant_id = current_account_with_tenant()
@@ -805,7 +869,7 @@ class ChildChunkUpdateApi(Resource):
child_chunk = db.session.scalar(
select(ChildChunk)
.where(
ChildChunk.id == str(child_chunk_id_str),
ChildChunk.id == child_chunk_id_str,
ChildChunk.tenant_id == current_tenant_id,
ChildChunk.segment_id == segment.id,
ChildChunk.document_id == document_id_str,
@@ -827,4 +891,4 @@ class ChildChunkUpdateApi(Resource):
child_chunk = SegmentService.update_child_chunk(payload.content, child_chunk, segment, document, dataset)
except ChildChunkIndexingServiceError as e:
raise ChildChunkIndexingError(str(e))
return {"data": marshal(child_chunk, child_chunk_fields)}, 200
return dump_response(ChildChunkDetailResponse, {"data": child_chunk}), 200
+167 -106
View File
@@ -1,15 +1,18 @@
from typing import Any
from typing import cast
from uuid import UUID
from flask import request
from flask_restx import marshal
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field, ValidationError, field_validator
from sqlalchemy import select
from werkzeug.exceptions import NotFound
from configs import dify_config
from controllers.common.controller_schemas import ChildChunkCreatePayload, ChildChunkUpdatePayload
from controllers.common.schema import register_schema_models
from controllers.common.schema import (
query_params_from_model,
query_params_from_request,
register_response_schema_models,
register_schema_models,
)
from controllers.service_api import service_api_ns
from controllers.service_api.app.error import ProviderNotInitializeError
from controllers.service_api.wraps import (
@@ -22,10 +25,19 @@ from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
from core.model_manager import ModelManager
from core.rag.index_processor.constant.index_type import IndexTechniqueType
from extensions.ext_database import db
from fields.segment_fields import child_chunk_fields, segment_fields
from fields.base import ResponseModel
from fields.segment_fields import (
ChildChunkDetailResponse,
ChildChunkListResponse,
SegmentDetailResponse,
SegmentResponse,
segment_response_with_summary,
segment_responses_with_summaries,
)
from graphon.model_runtime.entities.model_entities import ModelType
from libs.helper import dump_response
from libs.login import current_account_with_tenant
from models.dataset import Dataset
from models.dataset import Dataset, DocumentSegment
from services.dataset_service import DatasetService, DocumentService, SegmentService
from services.entities.knowledge_entities.knowledge_entities import SegmentUpdateArgs
from services.errors.chunk import ChildChunkDeleteIndexError, ChildChunkIndexingError
@@ -34,35 +46,27 @@ from services.errors.chunk import ChildChunkIndexingError as ChildChunkIndexingS
from services.summary_index_service import SummaryIndexService
def _marshal_segment_with_summary(segment, dataset_id: str) -> dict[str, Any]:
    """Serialize one segment with ``segment_fields`` and attach its summary text (or None)."""
    marshalled: dict[str, Any] = dict(marshal(segment, segment_fields))  # type: ignore[arg-type]
    summary_record = SummaryIndexService.get_segment_summary(segment_id=segment.id, dataset_id=dataset_id)
    if summary_record:
        marshalled["summary"] = summary_record.summary_content
    else:
        marshalled["summary"] = None
    return marshalled
class SegmentCreateItemPayload(BaseModel):
content: str = Field(min_length=1)
answer: str | None = None
keywords: list[str] | None = None
attachment_ids: list[str] | None = None
def _marshal_segments_with_summary(segments, dataset_id: str) -> list[dict[str, Any]]:
    """Serialize several segments, attaching summaries fetched in a single batch lookup."""
    ids = [item.id for item in segments]
    summary_by_id: dict[str, str | None] = {}
    if ids:
        # One service call for the whole batch instead of one per segment.
        records = SummaryIndexService.get_segments_summaries(segment_ids=ids, dataset_id=dataset_id)
        summary_by_id = {key: record.summary_content for key, record in records.items()}

    marshalled_segments: list[dict[str, Any]] = []
    for item in segments:
        entry: dict[str, Any] = dict(marshal(item, segment_fields))  # type: ignore[arg-type]
        # Segments without a summary record get None.
        entry["summary"] = summary_by_id.get(item.id)
        marshalled_segments.append(entry)
    return marshalled_segments
@field_validator("content")
@classmethod
def validate_content(cls, value: str) -> str:
if not value.strip():
raise ValueError("Content is empty")
return value
class SegmentCreatePayload(BaseModel):
segments: list[dict[str, Any]] | None = None
segments: list[SegmentCreateItemPayload] = Field(min_length=1)
class SegmentListQuery(BaseModel):
limit: int = Field(default=20, ge=1)
page: int = Field(default=1, ge=1)
status: list[str] = Field(default_factory=list)
keyword: str | None = None
@@ -77,9 +81,31 @@ class ChildChunkListQuery(BaseModel):
page: int = Field(default=1, ge=1)
# Shared path-parameter descriptions passed to ``service_api_ns.doc(params=...)``.
# Each mapping extends a smaller one so the wording stays identical across endpoints.
class SegmentDocParams:
    # Routes addressed by dataset and document.
    DATASET_DOCUMENT = {"dataset_id": "Dataset ID", "document_id": "Document ID"}
    # Routes that address a single segment.
    DATASET_DOCUMENT_SEGMENT = {**DATASET_DOCUMENT, "segment_id": "Segment ID"}
    # Child-chunk routes, where ``segment_id`` is described as the parent segment.
    DATASET_DOCUMENT_PARENT_SEGMENT = {**DATASET_DOCUMENT, "segment_id": "Parent segment ID"}
    # Routes that address a single child chunk under a parent segment.
    DATASET_DOCUMENT_CHILD_CHUNK = {**DATASET_DOCUMENT_PARENT_SEGMENT, "child_chunk_id": "Child chunk ID"}
# Response body returned after creating segments through the service API.
class SegmentCreateListResponse(ResponseModel):
    # The created segments.
    data: list[SegmentResponse]
    # ``doc_form`` value of the document the segments belong to.
    doc_form: str
# Response body for the service API segment list endpoint.
class SegmentListResponse(ResponseModel):
    # Segments on the current page.
    data: list[SegmentResponse]
    # ``doc_form`` value of the document the segments belong to.
    doc_form: str
    # Total count returned by the segment query.
    total: int
    # True when the page came back full (number of returned segments equals ``limit``).
    has_more: bool
    # Page size applied to the query.
    limit: int
    # Page number that was requested.
    page: int
register_schema_models(
service_api_ns,
SegmentCreatePayload,
SegmentCreateItemPayload,
SegmentListQuery,
SegmentUpdateArgs,
SegmentUpdatePayload,
@@ -87,6 +113,15 @@ register_schema_models(
ChildChunkListQuery,
ChildChunkUpdatePayload,
)
register_response_schema_models(
service_api_ns,
SegmentResponse,
SegmentCreateListResponse,
SegmentListResponse,
SegmentDetailResponse,
ChildChunkDetailResponse,
ChildChunkListResponse,
)
@service_api_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments")
@@ -96,7 +131,7 @@ class SegmentApi(DatasetApiResource):
@service_api_ns.expect(service_api_ns.models[SegmentCreatePayload.__name__])
@service_api_ns.doc("create_segments")
@service_api_ns.doc(description="Create segments in a document")
@service_api_ns.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"})
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT)
@service_api_ns.doc(
responses={
200: "Segments created successfully",
@@ -105,6 +140,11 @@ class SegmentApi(DatasetApiResource):
404: "Dataset or document not found",
}
)
@service_api_ns.response(
200,
"Segments created successfully",
service_api_ns.models[SegmentCreateListResponse.__name__],
)
@cloud_edition_billing_resource_check("vector_space", "dataset")
@cloud_edition_billing_knowledge_limit_check("add_segment", "dataset")
@cloud_edition_billing_rate_limit_check("knowledge", "dataset")
@@ -144,26 +184,35 @@ class SegmentApi(DatasetApiResource):
except ProviderTokenNotInitError as ex:
raise ProviderNotInitializeError(ex.description)
# validate args
payload = SegmentCreatePayload.model_validate(service_api_ns.payload or {})
if payload.segments is not None:
segments_limit = dify_config.DATASET_MAX_SEGMENTS_PER_REQUEST
if segments_limit > 0 and len(payload.segments) > segments_limit:
raise ValueError(f"Exceeded maximum segments limit of {segments_limit}.")
try:
payload = SegmentCreatePayload.model_validate(service_api_ns.payload or {})
except ValidationError as e:
return {"error": str(e)}, 400
segments_limit = dify_config.DATASET_MAX_SEGMENTS_PER_REQUEST
if segments_limit > 0 and len(payload.segments) > segments_limit:
raise ValueError(f"Exceeded maximum segments limit of {segments_limit}.")
segment_items = [segment.model_dump(exclude_none=True) for segment in payload.segments]
for args_item in payload.segments:
SegmentService.segment_create_args_validate(args_item, document)
segments = SegmentService.multi_create_segment(payload.segments, document, dataset)
return {
"data": _marshal_segments_with_summary(segments, dataset_id_str),
"doc_form": document.doc_form,
}, 200
else:
return {"error": "Segments is required"}, 400
for args_item in segment_items:
SegmentService.segment_create_args_validate(args_item, document)
segments = cast(list[DocumentSegment], SegmentService.multi_create_segment(segment_items, document, dataset))
segment_ids = [segment.id for segment in segments]
summaries: dict[str, str | None] = {}
if segment_ids:
summary_records = SummaryIndexService.get_segments_summaries(
segment_ids=segment_ids, dataset_id=dataset_id_str
)
summaries = {chunk_id: record.summary_content for chunk_id, record in summary_records.items()}
response = {
"data": segment_responses_with_summaries(segments, summaries),
"doc_form": document.doc_form,
}
return dump_response(SegmentCreateListResponse, response), 200
@service_api_ns.expect(service_api_ns.models[SegmentListQuery.__name__])
@service_api_ns.doc("list_segments")
@service_api_ns.doc(description="List segments in a document")
@service_api_ns.doc(params={"dataset_id": "Dataset ID", "document_id": "Document ID"})
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT)
@service_api_ns.doc(params=query_params_from_model(SegmentListQuery))
@service_api_ns.doc(
responses={
200: "Segments retrieved successfully",
@@ -171,12 +220,22 @@ class SegmentApi(DatasetApiResource):
404: "Dataset or document not found",
}
)
@service_api_ns.response(
200,
"Segments retrieved successfully",
service_api_ns.models[SegmentListResponse.__name__],
)
def get(self, tenant_id: str, dataset_id: UUID, document_id: UUID):
_, current_tenant_id = current_account_with_tenant()
"""Get segments."""
# check dataset
page = request.args.get("page", default=1, type=int)
limit = request.args.get("limit", default=20, type=int)
args = query_params_from_request(
SegmentListQuery,
list_fields=("status",),
use_defaults_for_malformed_ints=True,
)
page = args.page
limit = args.limit
dataset_id_str = str(dataset_id)
dataset = db.session.scalar(
select(Dataset).where(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id_str).limit(1)
@@ -205,13 +264,6 @@ class SegmentApi(DatasetApiResource):
except ProviderTokenNotInitError as ex:
raise ProviderNotInitializeError(ex.description)
args = SegmentListQuery.model_validate(
{
"status": request.args.getlist("status"),
"keyword": request.args.get("keyword"),
}
)
segments, total = SegmentService.get_segments(
document_id=document_id_str,
tenant_id=current_tenant_id,
@@ -220,9 +272,16 @@ class SegmentApi(DatasetApiResource):
page=page,
limit=limit,
)
segment_ids = [segment.id for segment in segments]
summaries: dict[str, str | None] = {}
if segment_ids:
summary_records = SummaryIndexService.get_segments_summaries(
segment_ids=segment_ids, dataset_id=dataset_id_str
)
summaries = {chunk_id: record.summary_content for chunk_id, record in summary_records.items()}
response = {
"data": _marshal_segments_with_summary(segments, dataset_id_str),
"data": segment_responses_with_summaries(segments, summaries),
"doc_form": document.doc_form,
"total": total,
"has_more": len(segments) == limit,
@@ -230,16 +289,14 @@ class SegmentApi(DatasetApiResource):
"page": page,
}
return response, 200
return dump_response(SegmentListResponse, response), 200
@service_api_ns.route("/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>")
class DatasetSegmentApi(DatasetApiResource):
@service_api_ns.doc("delete_segment")
@service_api_ns.doc(description="Delete a specific segment")
@service_api_ns.doc(
params={"dataset_id": "Dataset ID", "document_id": "Document ID", "segment_id": "Segment ID to delete"}
)
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_SEGMENT)
@service_api_ns.doc(
responses={
204: "Segment deleted successfully",
@@ -275,9 +332,7 @@ class DatasetSegmentApi(DatasetApiResource):
@service_api_ns.expect(service_api_ns.models[SegmentUpdatePayload.__name__])
@service_api_ns.doc("update_segment")
@service_api_ns.doc(description="Update a specific segment")
@service_api_ns.doc(
params={"dataset_id": "Dataset ID", "document_id": "Document ID", "segment_id": "Segment ID to update"}
)
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_SEGMENT)
@service_api_ns.doc(
responses={
200: "Segment updated successfully",
@@ -285,6 +340,7 @@ class DatasetSegmentApi(DatasetApiResource):
404: "Dataset, document, or segment not found",
}
)
@service_api_ns.response(200, "Segment updated successfully", service_api_ns.models[SegmentDetailResponse.__name__])
@cloud_edition_billing_resource_check("vector_space", "dataset")
@cloud_edition_billing_rate_limit_check("knowledge", "dataset")
def post(self, tenant_id: str, dataset_id: UUID, document_id: UUID, segment_id: UUID):
@@ -328,13 +384,16 @@ class DatasetSegmentApi(DatasetApiResource):
payload = SegmentUpdatePayload.model_validate(service_api_ns.payload or {})
updated_segment = SegmentService.update_segment(payload.segment, segment, document, dataset)
return {
"data": _marshal_segment_with_summary(updated_segment, dataset_id_str),
summary = SummaryIndexService.get_segment_summary(segment_id=updated_segment.id, dataset_id=dataset_id_str)
response = {
"data": segment_response_with_summary(updated_segment, summary.summary_content if summary else None),
"doc_form": document.doc_form,
}, 200
}
return dump_response(SegmentDetailResponse, response), 200
@service_api_ns.doc("get_segment")
@service_api_ns.doc(description="Get a specific segment by ID")
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_SEGMENT)
@service_api_ns.doc(
responses={
200: "Segment retrieved successfully",
@@ -342,6 +401,11 @@ class DatasetSegmentApi(DatasetApiResource):
404: "Dataset, document, or segment not found",
}
)
@service_api_ns.response(
200,
"Segment retrieved successfully",
service_api_ns.models[SegmentDetailResponse.__name__],
)
def get(self, tenant_id: str, dataset_id: UUID, document_id: UUID, segment_id: UUID):
_, current_tenant_id = current_account_with_tenant()
dataset_id_str = str(dataset_id)
@@ -364,7 +428,12 @@ class DatasetSegmentApi(DatasetApiResource):
if not segment:
raise NotFound("Segment not found.")
return {"data": _marshal_segment_with_summary(segment, dataset_id_str), "doc_form": document.doc_form}, 200
summary = SummaryIndexService.get_segment_summary(segment_id=segment.id, dataset_id=dataset_id_str)
response = {
"data": segment_response_with_summary(segment, summary.summary_content if summary else None),
"doc_form": document.doc_form,
}
return dump_response(SegmentDetailResponse, response), 200
@service_api_ns.route(
@@ -376,9 +445,7 @@ class ChildChunkApi(DatasetApiResource):
@service_api_ns.expect(service_api_ns.models[ChildChunkCreatePayload.__name__])
@service_api_ns.doc("create_child_chunk")
@service_api_ns.doc(description="Create a new child chunk for a segment")
@service_api_ns.doc(
params={"dataset_id": "Dataset ID", "document_id": "Document ID", "segment_id": "Parent segment ID"}
)
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_PARENT_SEGMENT)
@service_api_ns.doc(
responses={
200: "Child chunk created successfully",
@@ -386,6 +453,11 @@ class ChildChunkApi(DatasetApiResource):
404: "Dataset, document, or segment not found",
}
)
@service_api_ns.response(
200,
"Child chunk created successfully",
service_api_ns.models[ChildChunkDetailResponse.__name__],
)
@cloud_edition_billing_resource_check("vector_space", "dataset")
@cloud_edition_billing_knowledge_limit_check("add_segment", "dataset")
@cloud_edition_billing_rate_limit_check("knowledge", "dataset")
@@ -437,14 +509,12 @@ class ChildChunkApi(DatasetApiResource):
except ChildChunkIndexingServiceError as e:
raise ChildChunkIndexingError(str(e))
return {"data": marshal(child_chunk, child_chunk_fields)}, 200
return dump_response(ChildChunkDetailResponse, {"data": child_chunk}), 200
@service_api_ns.expect(service_api_ns.models[ChildChunkListQuery.__name__])
@service_api_ns.doc("list_child_chunks")
@service_api_ns.doc(description="List child chunks for a segment")
@service_api_ns.doc(
params={"dataset_id": "Dataset ID", "document_id": "Document ID", "segment_id": "Parent segment ID"}
)
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_PARENT_SEGMENT)
@service_api_ns.doc(params=query_params_from_model(ChildChunkListQuery))
@service_api_ns.doc(
responses={
200: "Child chunks retrieved successfully",
@@ -452,6 +522,11 @@ class ChildChunkApi(DatasetApiResource):
404: "Dataset, document, or segment not found",
}
)
@service_api_ns.response(
200,
"Child chunks retrieved successfully",
service_api_ns.models[ChildChunkListResponse.__name__],
)
def get(self, tenant_id: str, dataset_id: UUID, document_id: UUID, segment_id: UUID):
_, current_tenant_id = current_account_with_tenant()
"""Get child chunks."""
@@ -475,13 +550,7 @@ class ChildChunkApi(DatasetApiResource):
if not segment:
raise NotFound("Segment not found.")
args = ChildChunkListQuery.model_validate(
{
"limit": request.args.get("limit", default=20, type=int),
"keyword": request.args.get("keyword"),
"page": request.args.get("page", default=1, type=int),
}
)
args = query_params_from_request(ChildChunkListQuery, use_defaults_for_malformed_ints=True)
page = args.page
limit = min(args.limit, 100)
@@ -491,13 +560,14 @@ class ChildChunkApi(DatasetApiResource):
segment_id_str, document_id_str, dataset_id_str, page, limit, keyword
)
return {
"data": marshal(child_chunks.items, child_chunk_fields),
response = {
"data": child_chunks.items,
"total": child_chunks.total,
"total_pages": child_chunks.pages,
"page": page,
"limit": limit,
}, 200
}
return dump_response(ChildChunkListResponse, response), 200
@service_api_ns.route(
@@ -508,14 +578,7 @@ class DatasetChildChunkApi(DatasetApiResource):
@service_api_ns.doc("delete_child_chunk")
@service_api_ns.doc(description="Delete a specific child chunk")
@service_api_ns.doc(
params={
"dataset_id": "Dataset ID",
"document_id": "Document ID",
"segment_id": "Parent segment ID",
"child_chunk_id": "Child chunk ID to delete",
}
)
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_CHILD_CHUNK)
@service_api_ns.doc(
responses={
204: "Child chunk deleted successfully",
@@ -549,7 +612,7 @@ class DatasetChildChunkApi(DatasetApiResource):
raise NotFound("Segment not found.")
# validate segment belongs to the specified document
if str(segment.document_id) != str(document_id_str):
if segment.document_id != document_id_str:
raise NotFound("Document not found.")
child_chunk_id_str = str(child_chunk_id)
@@ -561,7 +624,7 @@ class DatasetChildChunkApi(DatasetApiResource):
raise NotFound("Child chunk not found.")
# validate child chunk belongs to the specified segment
if str(child_chunk.segment_id) != str(segment.id):
if child_chunk.segment_id != segment.id:
raise NotFound("Child chunk not found.")
try:
@@ -574,14 +637,7 @@ class DatasetChildChunkApi(DatasetApiResource):
@service_api_ns.expect(service_api_ns.models[ChildChunkUpdatePayload.__name__])
@service_api_ns.doc("update_child_chunk")
@service_api_ns.doc(description="Update a specific child chunk")
@service_api_ns.doc(
params={
"dataset_id": "Dataset ID",
"document_id": "Document ID",
"segment_id": "Parent segment ID",
"child_chunk_id": "Child chunk ID to update",
}
)
@service_api_ns.doc(params=SegmentDocParams.DATASET_DOCUMENT_CHILD_CHUNK)
@service_api_ns.doc(
responses={
200: "Child chunk updated successfully",
@@ -589,6 +645,11 @@ class DatasetChildChunkApi(DatasetApiResource):
404: "Dataset, document, segment, or child chunk not found",
}
)
@service_api_ns.response(
200,
"Child chunk updated successfully",
service_api_ns.models[ChildChunkDetailResponse.__name__],
)
@cloud_edition_billing_resource_check("vector_space", "dataset")
@cloud_edition_billing_knowledge_limit_check("add_segment", "dataset")
@cloud_edition_billing_rate_limit_check("knowledge", "dataset")
@@ -616,7 +677,7 @@ class DatasetChildChunkApi(DatasetApiResource):
raise NotFound("Segment not found.")
# validate segment belongs to the specified document
if str(segment.document_id) != str(document_id_str):
if segment.document_id != document_id_str:
raise NotFound("Segment not found.")
child_chunk_id_str = str(child_chunk_id)
@@ -628,7 +689,7 @@ class DatasetChildChunkApi(DatasetApiResource):
raise NotFound("Child chunk not found.")
# validate child chunk belongs to the specified segment
if str(child_chunk.segment_id) != str(segment.id):
if child_chunk.segment_id != segment.id:
raise NotFound("Child chunk not found.")
# validate args
@@ -639,4 +700,4 @@ class DatasetChildChunkApi(DatasetApiResource):
except ChildChunkIndexingServiceError as e:
raise ChildChunkIndexingError(str(e))
return {"data": marshal(child_chunk, child_chunk_fields)}, 200
return dump_response(ChildChunkDetailResponse, {"data": child_chunk}), 200
+105 -49
View File
@@ -1,53 +1,109 @@
from flask_restx import fields
from collections.abc import Iterable, Mapping
from dataclasses import dataclass
from datetime import datetime
from typing import Any
from libs.helper import TimestampField
from pydantic import field_serializer
child_chunk_fields = {
"id": fields.String,
"segment_id": fields.String,
"content": fields.String,
"position": fields.Integer,
"word_count": fields.Integer,
"type": fields.String,
"created_at": TimestampField,
"updated_at": TimestampField,
}
from fields.base import ResponseModel
from libs.helper import to_timestamp
attachment_fields = {
"id": fields.String,
"name": fields.String,
"size": fields.Integer,
"extension": fields.String,
"mime_type": fields.String,
"source_url": fields.String,
}
segment_fields = {
"id": fields.String,
"position": fields.Integer,
"document_id": fields.String,
"content": fields.String,
"sign_content": fields.String,
"answer": fields.String,
"word_count": fields.Integer,
"tokens": fields.Integer,
"keywords": fields.List(fields.String),
"index_node_id": fields.String,
"index_node_hash": fields.String,
"hit_count": fields.Integer,
"enabled": fields.Boolean,
"disabled_at": TimestampField,
"disabled_by": fields.String,
"status": fields.String,
"created_by": fields.String,
"created_at": TimestampField,
"updated_at": TimestampField,
"updated_by": fields.String,
"indexing_at": TimestampField,
"completed_at": TimestampField,
"error": fields.String,
"stopped_at": TimestampField,
"child_chunks": fields.List(fields.Nested(child_chunk_fields)),
"attachments": fields.List(fields.Nested(attachment_fields)),
"summary": fields.String, # Summary content for the segment
}
# Response schema for one attachment entry; SegmentResponse.attachments is a
# list of these. A `#` comment is used instead of a class docstring so the
# class body stays free of extra statements.
class SegmentAttachmentResponse(ResponseModel):
    id: str
    name: str
    size: int
    extension: str
    # The only field of this model that is allowed to be None.
    mime_type: str | None
    source_url: str
# Response schema for a single child chunk of a segment.
class ChildChunkResponse(ResponseModel):
    id: str
    segment_id: str
    content: str
    position: int
    word_count: int
    type: str
    # Timestamps are accepted either as datetime objects or as ints; both
    # forms go through the serializer below when the model is dumped.
    created_at: datetime | int
    updated_at: datetime | int

    @field_serializer("created_at", "updated_at")
    def serialize_timestamp(self, value: datetime | int) -> int:
        # Delegates to libs.helper.to_timestamp; presumably this yields an
        # epoch timestamp -- confirm against the helper.
        return to_timestamp(value)
# Response schema for a document segment, including its nested child chunks,
# attachments and optional summary text.
class SegmentResponse(ResponseModel):
    id: str
    position: int
    document_id: str
    content: str
    sign_content: str
    answer: str | None
    word_count: int
    tokens: int
    keywords: list[str] | None
    index_node_id: str | None
    index_node_hash: str | None
    hit_count: int
    enabled: bool
    # Optional timestamps: datetime, int, or None.
    disabled_at: datetime | int | None
    disabled_by: str | None
    status: str
    created_by: str
    # Required timestamps: datetime or int, never None.
    created_at: datetime | int
    updated_at: datetime | int
    updated_by: str | None
    indexing_at: datetime | int | None
    completed_at: datetime | int | None
    error: str | None
    stopped_at: datetime | int | None
    child_chunks: list[ChildChunkResponse]
    attachments: list[SegmentAttachmentResponse]
    # Not read from the segment row itself; the helpers in this module supply
    # it through SegmentWithSummary.
    summary: str | None

    @field_serializer("created_at", "updated_at")
    def serialize_required_timestamp(self, value: datetime | int) -> int:
        # Serializer for the timestamps that are always present.
        return to_timestamp(value)

    @field_serializer("disabled_at", "indexing_at", "completed_at", "stopped_at")
    def serialize_optional_timestamp(self, value: datetime | int | None) -> int | None:
        # Same helper as above, but the value may be None here; presumably
        # to_timestamp passes None through -- confirm in libs.helper.
        return to_timestamp(value)
@dataclass(frozen=True)
class SegmentWithSummary:
segment: Any
summary: str | None
def __getattr__(self, name: str) -> Any:
return getattr(self.segment, name)
def segment_response_with_summary(segment: Any, summary: str | None) -> SegmentResponse:
    """Build a ``SegmentResponse`` from a segment object and its summary text.

    The segment is wrapped in ``SegmentWithSummary`` so that attribute-based
    validation sees the segment's attributes together with ``summary``.
    """
    return SegmentResponse.model_validate(
        SegmentWithSummary(segment=segment, summary=summary),
        from_attributes=True,
    )
def segment_responses_with_summaries(
    segments: Iterable[Any],
    summaries: Mapping[str, str | None],
) -> list[SegmentResponse]:
    """Build one ``SegmentResponse`` per segment, in iteration order.

    Each segment's summary is looked up in ``summaries`` by ``segment.id``;
    a segment without an entry gets ``None`` as its summary.
    """
    return [
        segment_response_with_summary(item, summaries.get(item.id))
        for item in segments
    ]
# Envelope for a single-segment response: the serialized segment under `data`
# plus the `doc_form` value of the document it belongs to.
class SegmentDetailResponse(ResponseModel):
    data: SegmentResponse
    doc_form: str
# Envelope for a single child chunk response: the chunk is nested under `data`.
class ChildChunkDetailResponse(ResponseModel):
    data: ChildChunkResponse
# Paginated list of child chunks: the chunks in `data` plus paging counters.
class ChildChunkListResponse(ResponseModel):
    data: list[ChildChunkResponse]
    # Total number of matching child chunks and the resulting page count.
    total: int
    total_pages: int
    # Page number and page size that were applied to this result.
    page: int
    limit: int
+174 -53
View File
@@ -5175,15 +5175,15 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| payload | body | | Yes | [SegmentCreatePayload](#segmentcreatepayload) |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Success |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Segment created successfully | [SegmentDetailResponse](#segmentdetailresponse) |
### /datasets/{dataset_id}/documents/{document_id}/segment/{action}
@@ -5192,9 +5192,10 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| action | path | | Yes | string |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| action | path | Action | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | query | Segment IDs | No | [ string ] |
##### Responses
@@ -5209,8 +5210,9 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | query | Segment IDs | No | [ string ] |
##### Responses
@@ -5223,14 +5225,20 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| enabled | query | | No | string |
| hit_count_gte | query | | No | integer |
| keyword | query | | No | string |
| limit | query | | No | integer |
| page | query | | No | integer |
| status | query | | No | [ string ] |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Success |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Segments retrieved successfully | [ConsoleSegmentListResponse](#consolesegmentlistresponse) |
### /datasets/{dataset_id}/documents/{document_id}/segments/batch_import
@@ -5270,9 +5278,9 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| segment_id | path | | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Segment ID | Yes | string |
##### Responses
@@ -5285,16 +5293,16 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| segment_id | path | | Yes | string |
| payload | body | | Yes | [SegmentUpdatePayload](#segmentupdatepayload) |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Segment ID | Yes | string |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Success |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Segment updated successfully | [SegmentDetailResponse](#segmentdetailresponse) |
### /datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks
@@ -5303,46 +5311,50 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| segment_id | path | | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Parent segment ID | Yes | string |
| keyword | query | | No | string |
| limit | query | | No | integer |
| page | query | | No | integer |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Success |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Child chunks retrieved successfully | [ChildChunkListResponse](#childchunklistresponse) |
#### PATCH
##### Parameters
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| segment_id | path | | Yes | string |
| payload | body | | Yes | [ChildChunkBatchUpdatePayload](#childchunkbatchupdatepayload) |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Parent segment ID | Yes | string |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Success |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Child chunks updated successfully | [ChildChunkBatchUpdateResponse](#childchunkbatchupdateresponse) |
#### POST
##### Parameters
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| segment_id | path | | Yes | string |
| payload | body | | Yes | [ChildChunkCreatePayload](#childchunkcreatepayload) |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Parent segment ID | Yes | string |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Success |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Child chunk created successfully | [ChildChunkDetailResponse](#childchunkdetailresponse) |
### /datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}
@@ -5351,10 +5363,10 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| child_chunk_id | path | | Yes | string |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| segment_id | path | | Yes | string |
| child_chunk_id | path | Child chunk ID | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Parent segment ID | Yes | string |
##### Responses
@@ -5367,17 +5379,17 @@ Update document processing status (pause/resume)
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| child_chunk_id | path | | Yes | string |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| segment_id | path | | Yes | string |
| payload | body | | Yes | [ChildChunkUpdatePayload](#childchunkupdatepayload) |
| child_chunk_id | path | Child chunk ID | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Parent segment ID | Yes | string |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Success |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Child chunk updated successfully | [ChildChunkDetailResponse](#childchunkdetailresponse) |
### /datasets/{dataset_id}/documents/{document_id}/summary-status
@@ -11718,12 +11730,55 @@ Button styles for user actions.
| ---- | ---- | ----------- | -------- |
| chunks | [ [ChildChunkUpdateArgs](#childchunkupdateargs) ] | | Yes |
#### ChildChunkBatchUpdateResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [ [ChildChunkResponse](#childchunkresponse) ] | | Yes |
#### ChildChunkCreatePayload
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| content | string | | Yes |
#### ChildChunkDetailResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [ChildChunkResponse](#childchunkresponse) | | Yes |
#### ChildChunkListQuery
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| keyword | string | | No |
| limit | integer | | No |
| page | integer | | No |
#### ChildChunkListResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [ [ChildChunkResponse](#childchunkresponse) ] | | Yes |
| limit | integer | | Yes |
| page | integer | | Yes |
| total | integer | | Yes |
| total_pages | integer | | Yes |
#### ChildChunkResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| content | string | | Yes |
| created_at | integer | | Yes |
| id | string | | Yes |
| position | integer | | Yes |
| segment_id | string | | Yes |
| type | string | | Yes |
| updated_at | integer | | Yes |
| word_count | integer | | Yes |
#### ChildChunkUpdateArgs
| Name | Type | Description | Required |
@@ -11861,6 +11916,16 @@ Condition detail
| page | integer | Page number | No |
| tag_ids | [ string ] | Filter by tag IDs | No |
#### ConsoleSegmentListResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [ [SegmentResponse](#segmentresponse) ] | | Yes |
| limit | integer | | Yes |
| page | integer | | Yes |
| total | integer | | Yes |
| total_pages | integer | | Yes |
#### Conversation
| Name | Type | Description | Required |
@@ -14865,6 +14930,17 @@ Form input definition.
| last_id | string | | No |
| limit | integer | | No |
#### SegmentAttachmentResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| extension | string | | Yes |
| id | string | | Yes |
| mime_type | string | | Yes |
| name | string | | Yes |
| size | integer | | Yes |
| source_url | string | | Yes |
#### SegmentBatchImportStatusResponse
| Name | Type | Description | Required |
@@ -14881,6 +14957,19 @@ Form input definition.
| content | string | | Yes |
| keywords | [ string ] | | No |
#### SegmentDetailResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [SegmentResponse](#segmentresponse) | | Yes |
| doc_form | string | | Yes |
#### SegmentIdListQuery
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| segment_id | [ string ] | Segment IDs | No |
#### SegmentListQuery
| Name | Type | Description | Required |
@@ -14892,6 +14981,38 @@ Form input definition.
| page | integer | | No |
| status | [ string ] | | No |
#### SegmentResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| answer | string | | Yes |
| attachments | [ [SegmentAttachmentResponse](#segmentattachmentresponse) ] | | Yes |
| child_chunks | [ [ChildChunkResponse](#childchunkresponse) ] | | Yes |
| completed_at | integer | | Yes |
| content | string | | Yes |
| created_at | integer | | Yes |
| created_by | string | | Yes |
| disabled_at | integer | | Yes |
| disabled_by | string | | Yes |
| document_id | string | | Yes |
| enabled | boolean | | Yes |
| error | string | | Yes |
| hit_count | integer | | Yes |
| id | string | | Yes |
| index_node_hash | string | | Yes |
| index_node_id | string | | Yes |
| indexing_at | integer | | Yes |
| keywords | [ string ] | | Yes |
| position | integer | | Yes |
| sign_content | string | | Yes |
| status | string | | Yes |
| stopped_at | integer | | Yes |
| summary | string | | Yes |
| tokens | integer | | Yes |
| updated_at | integer | | Yes |
| updated_by | string | | Yes |
| word_count | integer | | Yes |
#### SegmentUpdatePayload
| Name | Type | Description | Required |
+159 -46
View File
@@ -1064,17 +1064,20 @@ List segments in a document
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| payload | body | | Yes | [SegmentListQuery](#segmentlistquery) |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| keyword | query | | No | string |
| limit | query | | No | integer |
| page | query | | No | integer |
| status | query | | No | [ string ] |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Segments retrieved successfully |
| 401 | Unauthorized - invalid API token |
| 404 | Dataset or document not found |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Segments retrieved successfully | [SegmentListResponse](#segmentlistresponse) |
| 401 | Unauthorized - invalid API token | |
| 404 | Dataset or document not found | |
#### POST
##### Description
@@ -1091,12 +1094,12 @@ Create segments in a document
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Segments created successfully |
| 400 | Bad request - segments data is missing |
| 401 | Unauthorized - invalid API token |
| 404 | Dataset or document not found |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Segments created successfully | [SegmentCreateListResponse](#segmentcreatelistresponse) |
| 400 | Bad request - segments data is missing | |
| 401 | Unauthorized - invalid API token | |
| 404 | Dataset or document not found | |
### /datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}
@@ -1111,7 +1114,7 @@ Delete a specific segment
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Segment ID to delete | Yes | string |
| segment_id | path | Segment ID | Yes | string |
##### Responses
@@ -1130,17 +1133,17 @@ Get a specific segment by ID
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| dataset_id | path | | Yes | string |
| document_id | path | | Yes | string |
| segment_id | path | | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Segment ID | Yes | string |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Segment retrieved successfully |
| 401 | Unauthorized - invalid API token |
| 404 | Dataset, document, or segment not found |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Segment retrieved successfully | [SegmentDetailResponse](#segmentdetailresponse) |
| 401 | Unauthorized - invalid API token | |
| 404 | Dataset, document, or segment not found | |
#### POST
##### Description
@@ -1154,15 +1157,15 @@ Update a specific segment
| payload | body | | Yes | [SegmentUpdatePayload](#segmentupdatepayload) |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Segment ID to update | Yes | string |
| segment_id | path | Segment ID | Yes | string |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Segment updated successfully |
| 401 | Unauthorized - invalid API token |
| 404 | Dataset, document, or segment not found |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Segment updated successfully | [SegmentDetailResponse](#segmentdetailresponse) |
| 401 | Unauthorized - invalid API token | |
| 404 | Dataset, document, or segment not found | |
### /datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks
@@ -1175,18 +1178,20 @@ List child chunks for a segment
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| payload | body | | Yes | [ChildChunkListQuery](#childchunklistquery) |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Parent segment ID | Yes | string |
| keyword | query | | No | string |
| limit | query | | No | integer |
| page | query | | No | integer |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Child chunks retrieved successfully |
| 401 | Unauthorized - invalid API token |
| 404 | Dataset, document, or segment not found |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Child chunks retrieved successfully | [ChildChunkListResponse](#childchunklistresponse) |
| 401 | Unauthorized - invalid API token | |
| 404 | Dataset, document, or segment not found | |
#### POST
##### Description
@@ -1204,11 +1209,11 @@ Create a new child chunk for a segment
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Child chunk created successfully |
| 401 | Unauthorized - invalid API token |
| 404 | Dataset, document, or segment not found |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Child chunk created successfully | [ChildChunkDetailResponse](#childchunkdetailresponse) |
| 401 | Unauthorized - invalid API token | |
| 404 | Dataset, document, or segment not found | |
### /datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}
@@ -1221,7 +1226,7 @@ Delete a specific child chunk
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| child_chunk_id | path | Child chunk ID to delete | Yes | string |
| child_chunk_id | path | Child chunk ID | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Parent segment ID | Yes | string |
@@ -1244,18 +1249,18 @@ Update a specific child chunk
| Name | Located in | Description | Required | Schema |
| ---- | ---------- | ----------- | -------- | ------ |
| payload | body | | Yes | [ChildChunkUpdatePayload](#childchunkupdatepayload) |
| child_chunk_id | path | Child chunk ID to update | Yes | string |
| child_chunk_id | path | Child chunk ID | Yes | string |
| dataset_id | path | Dataset ID | Yes | string |
| document_id | path | Document ID | Yes | string |
| segment_id | path | Parent segment ID | Yes | string |
##### Responses
| Code | Description |
| ---- | ----------- |
| 200 | Child chunk updated successfully |
| 401 | Unauthorized - invalid API token |
| 404 | Dataset, document, segment, or child chunk not found |
| Code | Description | Schema |
| ---- | ----------- | ------ |
| 200 | Child chunk updated successfully | [ChildChunkDetailResponse](#childchunkdetailresponse) |
| 401 | Unauthorized - invalid API token | |
| 404 | Dataset, document, segment, or child chunk not found | |
### /datasets/{dataset_id}/documents/{document_id}/update-by-file
@@ -2222,6 +2227,12 @@ Returns a list of available models for the specified model type.
| ---- | ---- | ----------- | -------- |
| content | string | | Yes |
#### ChildChunkDetailResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [ChildChunkResponse](#childchunkresponse) | | Yes |
#### ChildChunkListQuery
| Name | Type | Description | Required |
@@ -2230,6 +2241,29 @@ Returns a list of available models for the specified model type.
| limit | integer | | No |
| page | integer | | No |
#### ChildChunkListResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [ [ChildChunkResponse](#childchunkresponse) ] | | Yes |
| limit | integer | | Yes |
| page | integer | | Yes |
| total | integer | | Yes |
| total_pages | integer | | Yes |
#### ChildChunkResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| content | string | | Yes |
| created_at | integer | | Yes |
| id | string | | Yes |
| position | integer | | Yes |
| segment_id | string | | Yes |
| type | string | | Yes |
| updated_at | integer | | Yes |
| word_count | integer | | Yes |
#### ChildChunkUpdatePayload
| Name | Type | Description | Required |
@@ -2954,19 +2988,98 @@ Metadata operation data
| segmentation | [Segmentation](#segmentation) | | No |
| subchunk_segmentation | [Segmentation](#segmentation) | | No |
#### SegmentAttachmentResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| extension | string | | Yes |
| id | string | | Yes |
| mime_type | string | | Yes |
| name | string | | Yes |
| size | integer | | Yes |
| source_url | string | | Yes |
#### SegmentCreateItemPayload
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| answer | string | | No |
| attachment_ids | [ string ] | | No |
| content | string | | Yes |
| keywords | [ string ] | | No |
#### SegmentCreateListResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [ [SegmentResponse](#segmentresponse) ] | | Yes |
| doc_form | string | | Yes |
#### SegmentCreatePayload
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| segments | [ object ] | | No |
| segments | [ [SegmentCreateItemPayload](#segmentcreateitempayload) ] | | Yes |
#### SegmentDetailResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [SegmentResponse](#segmentresponse) | | Yes |
| doc_form | string | | Yes |
#### SegmentListQuery
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| keyword | string | | No |
| limit | integer | | No |
| page | integer | | No |
| status | [ string ] | | No |
#### SegmentListResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| data | [ [SegmentResponse](#segmentresponse) ] | | Yes |
| doc_form | string | | Yes |
| has_more | boolean | | Yes |
| limit | integer | | Yes |
| page | integer | | Yes |
| total | integer | | Yes |
#### SegmentResponse
| Name | Type | Description | Required |
| ---- | ---- | ----------- | -------- |
| answer | string | | Yes |
| attachments | [ [SegmentAttachmentResponse](#segmentattachmentresponse) ] | | Yes |
| child_chunks | [ [ChildChunkResponse](#childchunkresponse) ] | | Yes |
| completed_at | integer | | Yes |
| content | string | | Yes |
| created_at | integer | | Yes |
| created_by | string | | Yes |
| disabled_at | integer | | Yes |
| disabled_by | string | | Yes |
| document_id | string | | Yes |
| enabled | boolean | | Yes |
| error | string | | Yes |
| hit_count | integer | | Yes |
| id | string | | Yes |
| index_node_hash | string | | Yes |
| index_node_id | string | | Yes |
| indexing_at | integer | | Yes |
| keywords | [ string ] | | Yes |
| position | integer | | Yes |
| sign_content | string | | Yes |
| status | string | | Yes |
| stopped_at | integer | | Yes |
| summary | string | | Yes |
| tokens | integer | | Yes |
| updated_at | integer | | Yes |
| updated_by | string | | Yes |
| word_count | integer | | Yes |
#### SegmentUpdateArgs
| Name | Type | Description | Required |
@@ -0,0 +1,97 @@
"""DB-backed integration tests for console dataset segment endpoints."""
from __future__ import annotations
from uuid import uuid4
from flask.testing import FlaskClient
from sqlalchemy.orm import Session
from core.rag.index_processor.constant.index_type import IndexStructureType, IndexTechniqueType
from models.dataset import Dataset, Document, DocumentSegment, DocumentSegmentSummary
from models.enums import DataSourceType, DocumentCreatedFrom, IndexingStatus, SegmentStatus, SummaryStatus
from tests.test_containers_integration_tests.controllers.console.helpers import (
authenticate_console_client,
create_console_account_and_tenant,
)
def test_list_segments_uses_real_db_query_and_console_response_shape(
    test_client_with_containers: FlaskClient,
    db_session_with_containers: Session,
) -> None:
    """List console segments over HTTP against rows persisted through the session fixture.

    Persists a dataset, a document, one completed segment and a completed
    summary for that segment, then calls the console segment-list endpoint
    with paging, status, keyword and ``enabled`` query parameters and checks
    the response envelope and the single returned segment.
    """
    session = db_session_with_containers
    client = test_client_with_containers
    account, tenant = create_console_account_and_tenant(session)

    # Each row is committed before the next one is built so its generated id
    # can be used as a foreign key value.
    dataset = Dataset(
        tenant_id=tenant.id,
        name=f"Console Segment Dataset {uuid4()}",
        description="Console segment integration dataset",
        data_source_type=DataSourceType.UPLOAD_FILE,
        indexing_technique=IndexTechniqueType.ECONOMY,
        created_by=account.id,
        permission="only_me",
        provider="vendor",
    )
    session.add(dataset)
    session.commit()

    document = Document(
        tenant_id=tenant.id,
        dataset_id=dataset.id,
        position=1,
        data_source_type=DataSourceType.UPLOAD_FILE,
        batch=f"batch-{uuid4()}",
        name="console-segment-doc.txt",
        created_from=DocumentCreatedFrom.WEB,
        created_by=account.id,
        enabled=True,
        archived=False,
        indexing_status=IndexingStatus.COMPLETED,
        doc_form=IndexStructureType.PARAGRAPH_INDEX,
        word_count=3,
        tokens=4,
    )
    session.add(document)
    session.commit()

    segment = DocumentSegment(
        tenant_id=tenant.id,
        dataset_id=dataset.id,
        document_id=document.id,
        position=1,
        content="Console integration segment",
        word_count=3,
        tokens=4,
        keywords=["console", "integration"],
        status=SegmentStatus.COMPLETED,
        created_by=account.id,
    )
    session.add(segment)
    session.commit()
    # Captured as a plain value for the final assertions; presumably to avoid
    # touching the ORM instance after the request -- TODO confirm.
    segment_id = segment.id

    segment_summary = DocumentSegmentSummary(
        dataset_id=dataset.id,
        document_id=document.id,
        chunk_id=segment.id,
        summary_content="Console DB summary",
        status=SummaryStatus.COMPLETED,
    )
    session.add(segment_summary)
    session.commit()

    url = (
        f"/console/api/datasets/{dataset.id}/documents/{document.id}/segments"
        "?page=1&limit=10&status=completed&keyword=integration&enabled=all"
    )
    auth_headers = authenticate_console_client(client, account)
    response = client.get(url, headers=auth_headers)

    assert response.status_code == 200
    body = response.get_json()
    # The console envelope carries ``total_pages`` and no ``has_more``.
    assert set(body) == {"data", "limit", "total", "total_pages", "page"}
    assert body["limit"] == 10
    assert body["total"] == 1
    assert body["total_pages"] == 1
    assert "has_more" not in body
    first_item = body["data"][0]
    assert first_item["id"] == segment_id
    assert first_item["summary"] == "Console DB summary"
@@ -0,0 +1,153 @@
"""DB-backed integration tests for service API dataset segment endpoints."""
from __future__ import annotations
from uuid import uuid4
from flask.testing import FlaskClient
from sqlalchemy.orm import Session
from core.rag.index_processor.constant.index_type import IndexStructureType, IndexTechniqueType
from models.dataset import ChildChunk, Dataset, Document, DocumentSegment, DocumentSegmentSummary
from models.enums import (
ApiTokenType,
DataSourceType,
DocumentCreatedFrom,
IndexingStatus,
SegmentStatus,
SegmentType,
SummaryStatus,
)
from models.model import ApiToken
from tests.test_containers_integration_tests.controllers.console.helpers import create_console_account_and_tenant
def _create_dataset_graph(db_session: Session) -> tuple[Dataset, Document, DocumentSegment]:
    """Persist a dataset, a document and one completed segment, and return them.

    Also persists a completed summary for the segment and a dataset-type API
    token for the created tenant. Neither is returned; the token is looked up
    again by ``_auth_headers``.
    """
    account, tenant = create_console_account_and_tenant(db_session)

    # Parent rows are committed first so their generated ids are available
    # as foreign key values for the rows that follow.
    dataset = Dataset(
        tenant_id=tenant.id,
        name=f"Segment Dataset {uuid4()}",
        description="Segment integration dataset",
        data_source_type=DataSourceType.UPLOAD_FILE,
        indexing_technique=IndexTechniqueType.ECONOMY,
        created_by=account.id,
        permission="only_me",
        provider="vendor",
        enable_api=True,
    )
    db_session.add(dataset)
    db_session.commit()

    document = Document(
        tenant_id=tenant.id,
        dataset_id=dataset.id,
        position=1,
        data_source_type=DataSourceType.UPLOAD_FILE,
        batch=f"batch-{uuid4()}",
        name="segment-doc.txt",
        created_from=DocumentCreatedFrom.API,
        created_by=account.id,
        enabled=True,
        archived=False,
        indexing_status=IndexingStatus.COMPLETED,
        doc_form=IndexStructureType.PARAGRAPH_INDEX,
        word_count=4,
        tokens=5,
    )
    db_session.add(document)
    db_session.commit()

    segment = DocumentSegment(
        tenant_id=tenant.id,
        dataset_id=dataset.id,
        document_id=document.id,
        position=1,
        content="Segment content for integration",
        word_count=4,
        tokens=5,
        keywords=["segment", "integration"],
        status=SegmentStatus.COMPLETED,
        created_by=account.id,
    )
    db_session.add(segment)
    db_session.commit()

    # The summary and the API token do not reference each other, so both are
    # staged and flushed by a single commit.
    segment_summary = DocumentSegmentSummary(
        dataset_id=dataset.id,
        document_id=document.id,
        chunk_id=segment.id,
        summary_content="DB summary",
        status=SummaryStatus.COMPLETED,
    )
    db_session.add(segment_summary)

    dataset_token = ApiToken(
        tenant_id=tenant.id,
        type=ApiTokenType.DATASET,
        token=f"dataset-{uuid4().hex}",
    )
    db_session.add(dataset_token)
    db_session.commit()

    return dataset, document, segment
def _auth_headers(db_session: Session, dataset: Dataset) -> dict[str, str]:
    """Build a bearer ``Authorization`` header from the dataset tenant's API token.

    Looks up the dataset-type ``ApiToken`` row for ``dataset.tenant_id``;
    ``.one()`` requires that exactly one such row exists.
    """
    token_query = db_session.query(ApiToken).filter_by(
        tenant_id=dataset.tenant_id,
        type=ApiTokenType.DATASET,
    )
    api_token = token_query.one()
    return {"Authorization": f"Bearer {api_token.token}"}
def test_list_segments_uses_real_services_and_service_api_shape(
    test_client_with_containers: FlaskClient,
    db_session_with_containers: Session,
) -> None:
    """List segments through the ``/v1`` endpoint and check the service API envelope.

    Uses the rows created by ``_create_dataset_graph`` and expects the one
    completed segment back, with its summary and empty ``attachments`` and
    ``child_chunks`` lists.
    """
    dataset, document, segment = _create_dataset_graph(db_session_with_containers)
    segment_id = segment.id

    url = (
        f"/v1/datasets/{dataset.id}/documents/{document.id}/segments"
        "?page=1&limit=20&status=completed&keyword=integration"
    )
    auth_headers = _auth_headers(db_session_with_containers, dataset)
    response = test_client_with_containers.get(url, headers=auth_headers)

    assert response.status_code == 200
    body = response.get_json()
    # The service API envelope carries ``doc_form`` and ``has_more`` and no
    # ``total_pages``.
    assert set(body) == {"data", "doc_form", "total", "has_more", "limit", "page"}
    assert body["doc_form"] == "text_model"
    assert body["total"] == 1
    assert "total_pages" not in body
    first_item = body["data"][0]
    assert first_item["id"] == segment_id
    assert first_item["summary"] == "DB summary"
    assert first_item["attachments"] == []
    assert first_item["child_chunks"] == []
def test_list_child_chunks_uses_real_segment_service(
    test_client_with_containers: FlaskClient,
    db_session_with_containers: Session,
) -> None:
    """List child chunks of a persisted segment through the ``/v1`` endpoint.

    Adds one customized child chunk to the segment created by
    ``_create_dataset_graph`` and expects it back, filtered by keyword, in a
    paginated envelope.
    """
    session = db_session_with_containers
    dataset, document, segment = _create_dataset_graph(session)

    chunk = ChildChunk(
        tenant_id=dataset.tenant_id,
        dataset_id=dataset.id,
        document_id=document.id,
        segment_id=segment.id,
        position=1,
        content="Child integration content",
        word_count=3,
        type=SegmentType.CUSTOMIZED,
        created_by=document.created_by,
    )
    session.add(chunk)
    session.commit()

    url = (
        f"/v1/datasets/{dataset.id}/documents/{document.id}/segments/{segment.id}/child_chunks"
        "?page=1&limit=20&keyword=integration"
    )
    auth_headers = _auth_headers(session, dataset)
    response = test_client_with_containers.get(url, headers=auth_headers)

    assert response.status_code == 200
    body = response.get_json()
    assert set(body) == {"data", "total", "total_pages", "page", "limit"}
    assert body["total"] == 1
    assert body["data"][0]["content"] == "Child integration content"
@@ -4,6 +4,7 @@ from typing import Literal
from unittest.mock import MagicMock, patch
import pytest
from flask import Flask
from flask_restx import Namespace
from pydantic import BaseModel, ConfigDict, Field
@@ -47,6 +48,13 @@ class QueryModel(BaseModel):
ambiguous: int | str | None = Field(default=None, description="Ambiguous query parameter")
class HelperQueryModel(BaseModel):
    # Query-string model passed to ``query_params_from_request`` in the tests
    # of this module.

    # Integer pagination fields; the defaults are what the tests expect when
    # the helper is asked to fall back for malformed values.
    page: int = 1
    limit: int = 20
    # Repeatable parameter; defaults to a fresh empty list per instance.
    status: list[str] = Field(default_factory=list)
    # Optional free-text filter.
    keyword: str | None = None
class NullableSchemaModel(BaseModel):
name: str | None = None
tags: list[str] | None = None
@@ -320,3 +328,41 @@ def test_query_params_from_model_builds_flask_restx_doc_params():
"required": False,
"description": "Ambiguous query parameter",
}
def test_query_params_from_request_preserves_repeated_list_params():
    """Repeated ``status`` values are kept as a list when named in ``list_fields``."""
    from controllers.common.schema import query_params_from_request

    flask_app = Flask(__name__)
    request_url = "/?page=2&limit=30&status=active&status=inactive&keyword=hello"

    with flask_app.test_request_context(request_url):
        parsed = query_params_from_request(HelperQueryModel, list_fields=("status",))

    assert parsed.page == 2
    assert parsed.limit == 30
    # Both occurrences of ``status`` survive, in request order.
    assert parsed.status == ["active", "inactive"]
    assert parsed.keyword == "hello"
def test_query_params_from_request_raises_for_malformed_ints_by_default():
    """Without the fallback flag, non-integer ``page``/``limit`` values raise ``ValueError``."""
    from controllers.common.schema import query_params_from_request

    flask_app = Flask(__name__)
    malformed_url = "/?page=bad&limit="

    with flask_app.test_request_context(malformed_url), pytest.raises(ValueError):
        query_params_from_request(HelperQueryModel, list_fields=("status",))
def test_query_params_from_request_can_use_model_default_for_malformed_defaulted_ints():
    """With the fallback flag set, malformed ``page``/``limit`` values resolve to the model defaults."""
    from controllers.common.schema import query_params_from_request

    flask_app = Flask(__name__)
    malformed_url = "/?page=bad&limit="

    with flask_app.test_request_context(malformed_url):
        parsed = query_params_from_request(
            HelperQueryModel,
            list_fields=("status",),
            use_defaults_for_malformed_ints=True,
        )

    # ``page=bad`` and the empty ``limit`` both resolve to the declared defaults.
    assert parsed.page == 1
    assert parsed.limit == 20
    assert parsed.status == []
@@ -10,13 +10,13 @@ from controllers.console import console_ns
from controllers.console.app.error import ProviderNotInitializeError
from controllers.console.datasets.datasets_segments import (
ChildChunkAddApi,
ChildChunkBatchUpdatePayload,
ChildChunkUpdateApi,
DatasetDocumentSegmentAddApi,
DatasetDocumentSegmentApi,
DatasetDocumentSegmentBatchImportApi,
DatasetDocumentSegmentListApi,
DatasetDocumentSegmentUpdateApi,
_get_segment_with_summary,
)
from controllers.console.datasets.error import (
ChildChunkDeleteIndexError,
@@ -25,9 +25,13 @@ from controllers.console.datasets.error import (
)
from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
from core.rag.index_processor.constant.index_type import IndexStructureType
from fields.segment_fields import segment_response_with_summary
from libs.datetime_utils import naive_utc_now
from models.dataset import ChildChunk, DocumentSegment
from models.enums import SegmentStatus, SegmentType
from models.model import UploadFile
from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
from services.errors.chunk import ChildChunkIndexingError as ChildChunkIndexingServiceError
def unwrap(func):
@@ -37,49 +41,89 @@ def unwrap(func):
def _segment():
return SimpleNamespace(
id="s1",
segment = DocumentSegment(
tenant_id="tenant-1",
dataset_id="ds-1",
document_id="doc-1",
position=1,
document_id="d1",
content="c",
sign_content="c",
answer="a",
word_count=1,
tokens=1,
keywords=[],
index_node_id="n1",
index_node_hash="h",
hit_count=0,
enabled=True,
disabled_at=None,
disabled_by=None,
status="normal",
created_by="u1",
created_at=naive_utc_now(),
updated_at=naive_utc_now(),
updated_by="u1",
indexing_at=None,
completed_at=None,
error=None,
stopped_at=None,
child_chunks=[],
attachments=[],
summary=None,
)
segment.id = "seg-1"
segment.answer = "a"
segment.keywords = ["test"]
segment.index_node_id = "n1"
segment.index_node_hash = "h"
segment.status = SegmentStatus.COMPLETED
segment.created_at = naive_utc_now()
segment.updated_at = naive_utc_now()
segment.updated_by = "u1"
return segment
def test_get_segment_with_summary(monkeypatch: pytest.MonkeyPatch):
def _child_chunk():
    """Build an in-memory ``ChildChunk`` with fixed ids and fresh timestamps.

    The instance is only constructed here; it is not added to any session.
    """
    constructor_kwargs = {
        "tenant_id": "tenant-1",
        "dataset_id": "ds-1",
        "document_id": "doc-1",
        "segment_id": "seg-1",
        "position": 1,
        "content": "child",
        "word_count": 1,
        "created_by": "u1",
    }
    chunk = ChildChunk(**constructor_kwargs)
    # These attributes are assigned after construction; presumably they are
    # normally populated by the database -- TODO confirm.
    chunk.id = "cc-1"
    chunk.type = SegmentType.CUSTOMIZED
    chunk.created_at = naive_utc_now()
    chunk.updated_at = naive_utc_now()
    return chunk
def _segment_response_dict():
return {
"id": "seg-1",
"position": 1,
"document_id": "doc-1",
"content": "c",
"sign_content": "c",
"answer": "a",
"word_count": 1,
"tokens": 1,
"keywords": ["test"],
"index_node_id": "n1",
"index_node_hash": "h",
"hit_count": 0,
"enabled": True,
"disabled_at": None,
"disabled_by": None,
"status": "completed",
"created_by": "u1",
"created_at": 1779678000,
"updated_at": 1779678000,
"updated_by": "u1",
"indexing_at": None,
"completed_at": None,
"error": None,
"stopped_at": None,
"child_chunks": [],
"attachments": [],
"summary": None,
}
def test_segment_response_with_summary():
segment = _segment()
summary = SimpleNamespace(summary_content="summary")
monkeypatch.setattr(
"services.summary_index_service.SummaryIndexService.get_segment_summary",
lambda *_args, **_kwargs: summary,
)
with (
patch("models.dataset.db.session.scalar", return_value=None),
patch("models.dataset.db.session.execute", return_value=MagicMock(all=MagicMock(return_value=[]))),
):
result = segment_response_with_summary(segment, "summary")
result = _get_segment_with_summary(segment, dataset_id="d1")
assert result["summary"] == "summary"
assert result.summary == "summary"
assert result.id == segment.id
class TestDatasetDocumentSegmentListApi:
@@ -90,8 +134,7 @@ class TestDatasetDocumentSegmentListApi:
dataset = MagicMock()
document = MagicMock()
segment = MagicMock(spec=DocumentSegment)
segment.id = "seg-1"
segment = _segment()
pagination = MagicMock()
pagination.items = [segment]
@@ -124,10 +167,8 @@ class TestDatasetDocumentSegmentListApi:
"services.summary_index_service.SummaryIndexService.get_segments_summaries",
return_value={},
),
patch(
"controllers.console.datasets.datasets_segments.marshal",
return_value={"id": "seg-1"},
),
patch("models.dataset.db.session.scalar", return_value=None),
patch("models.dataset.db.session.execute", return_value=MagicMock(all=MagicMock(return_value=[]))),
):
response, status = method(api, "ds-1", "doc-1")
@@ -370,8 +411,7 @@ class TestDatasetDocumentSegmentAddApi:
document = MagicMock()
document.doc_form = IndexStructureType.PARAGRAPH_INDEX
segment = MagicMock()
segment.id = "seg-1"
segment = _segment()
with (
app.test_request_context("/", json=payload),
@@ -401,13 +441,11 @@ class TestDatasetDocumentSegmentAddApi:
return_value=segment,
),
patch(
"controllers.console.datasets.datasets_segments.marshal",
return_value={"id": "seg-1"},
),
patch(
"controllers.console.datasets.datasets_segments._get_segment_with_summary",
return_value={"id": "seg-1"},
"controllers.console.datasets.datasets_segments.SummaryIndexService.get_segment_summary",
return_value=None,
),
patch("models.dataset.db.session.scalar", return_value=None),
patch("models.dataset.db.session.execute", return_value=MagicMock(all=MagicMock(return_value=[]))),
):
response, status = method(api, "ds-1", "doc-1")
@@ -509,7 +547,7 @@ class TestDatasetDocumentSegmentUpdateApi:
document = MagicMock()
document.doc_form = IndexStructureType.PARAGRAPH_INDEX
segment = MagicMock()
segment = _segment()
with (
app.test_request_context("/", json=payload),
@@ -528,7 +566,7 @@ class TestDatasetDocumentSegmentUpdateApi:
),
patch(
"controllers.console.datasets.datasets_segments.db.session.scalar",
return_value=segment,
side_effect=[segment, None],
),
patch(
"controllers.console.datasets.datasets_segments.DatasetService.check_dataset_permission",
@@ -543,9 +581,10 @@ class TestDatasetDocumentSegmentUpdateApi:
return_value=segment,
),
patch(
"controllers.console.datasets.datasets_segments._get_segment_with_summary",
return_value={"id": "seg-1"},
"controllers.console.datasets.datasets_segments.SummaryIndexService.get_segment_summary",
return_value=None,
),
patch("models.dataset.db.session.execute", return_value=MagicMock(all=MagicMock(return_value=[]))),
):
response, status = method(api, "ds-1", "doc-1", "seg-1")
@@ -800,6 +839,52 @@ class TestDatasetDocumentSegmentBatchImportApi:
class TestChildChunkAddApi:
def test_patch_documents_batch_update_payload(self):
    """The PATCH handler's API doc lists exactly the batch-update payload model under ``expect``."""
    documented_models = unwrap(ChildChunkAddApi.patch).__apidoc__["expect"]
    documented_names = [model.name for model in documented_models]

    assert documented_names == [ChildChunkBatchUpdatePayload.__name__]
def test_get_uses_default_pagination_for_malformed_ints(self, app: Flask):
    """GET with ``page=bad`` and an empty ``limit`` falls back to page 1 and limit 20.

    All collaborators of the handler are patched, so the test observes only
    the query parsing and the arguments forwarded to
    ``SegmentService.get_child_chunks``.
    """
    endpoint = ChildChunkAddApi()
    handler = unwrap(endpoint.get)

    empty_page = MagicMock(items=[], total=0, pages=0)

    with (
        app.test_request_context("/?page=bad&limit="),
        patch(
            "controllers.console.datasets.datasets_segments.current_account_with_tenant",
            return_value=(MagicMock(), "tenant-1"),
        ),
        patch(
            "controllers.console.datasets.datasets_segments.DatasetService.get_dataset",
            return_value=MagicMock(),
        ),
        patch(
            "controllers.console.datasets.datasets_segments.DatasetService.check_dataset_model_setting",
            return_value=None,
        ),
        patch(
            "controllers.console.datasets.datasets_segments.DocumentService.get_document",
            return_value=MagicMock(),
        ),
        patch(
            "controllers.console.datasets.datasets_segments.db.session.scalar",
            return_value=MagicMock(),
        ),
        patch(
            "controllers.console.datasets.datasets_segments.SegmentService.get_child_chunks",
            return_value=empty_page,
        ) as mocked_get_child_chunks,
    ):
        response, status = handler(endpoint, "ds-1", "doc-1", "seg-1")

    assert status == 200
    assert response["page"] == 1
    assert response["limit"] == 20
    # Positional order: segment id, document id, dataset id, page, limit, keyword.
    mocked_get_child_chunks.assert_called_once_with("seg-1", "doc-1", "ds-1", 1, 20, None)
def test_post_success(self, app: Flask):
api = ChildChunkAddApi()
method = unwrap(api.post)
@@ -814,7 +899,7 @@ class TestChildChunkAddApi:
document = MagicMock()
segment = MagicMock()
child_chunk = MagicMock(spec=ChildChunk)
child_chunk = _child_chunk()
with (
app.test_request_context("/", json=payload),
@@ -843,10 +928,6 @@ class TestChildChunkAddApi:
"controllers.console.datasets.datasets_segments.SegmentService.create_child_chunk",
return_value=child_chunk,
),
patch(
"controllers.console.datasets.datasets_segments.marshal",
return_value={"id": "cc-1"},
),
):
response, status = method(api, "ds-1", "doc-1", "seg-1")
@@ -890,7 +971,7 @@ class TestChildChunkAddApi:
),
patch(
"controllers.console.datasets.datasets_segments.SegmentService.create_child_chunk",
side_effect=services.errors.chunk.ChildChunkIndexingError("fail"),
side_effect=ChildChunkIndexingServiceError("fail"),
),
):
with pytest.raises(ChildChunkIndexingError):
@@ -977,7 +1058,7 @@ class TestChildChunkUpdateApi:
),
patch(
"controllers.console.datasets.datasets_segments.SegmentService.delete_child_chunk",
side_effect=services.errors.chunk.ChildChunkDeleteIndexError("fail"),
side_effect=ChildChunkDeleteIndexServiceError("fail"),
),
):
with pytest.raises(ChildChunkDeleteIndexError):
@@ -992,10 +1073,7 @@ class TestSegmentListAdvancedCases:
dataset = MagicMock()
document = MagicMock()
segment = MagicMock(spec=DocumentSegment)
segment.id = "seg-1"
segment.keywords = ["test"]
segment.enabled = True
segment = _segment()
pagination = MagicMock(items=[segment], total=1, pages=1)
@@ -1025,6 +1103,8 @@ class TestSegmentListAdvancedCases:
"services.summary_index_service.SummaryIndexService.get_segments_summaries",
return_value={},
),
patch("models.dataset.db.session.scalar", return_value=None),
patch("models.dataset.db.session.execute", return_value=MagicMock(all=MagicMock(return_value=[]))),
):
result = method(api, "ds-1", "doc-1")
@@ -29,15 +29,67 @@ from controllers.service_api.dataset.segment import (
DatasetChildChunkApi,
DatasetSegmentApi,
SegmentApi,
SegmentCreateItemPayload,
SegmentCreatePayload,
SegmentListQuery,
)
from core.rag.index_processor.constant.index_type import IndexStructureType
from libs.datetime_utils import naive_utc_now
from models.dataset import ChildChunk, Dataset, Document, DocumentSegment
from models.enums import IndexingStatus
from models.enums import IndexingStatus, SegmentType
from services.dataset_service import DocumentService, SegmentService
def _segment_response_dict(summary: str | None = None):
return {
"id": "seg-1",
"position": 1,
"document_id": "doc-id",
"content": "segment content",
"sign_content": "segment content",
"answer": None,
"word_count": 2,
"tokens": 3,
"keywords": ["segment"],
"index_node_id": None,
"index_node_hash": None,
"hit_count": 0,
"enabled": True,
"disabled_at": None,
"disabled_by": None,
"status": "completed",
"created_by": "account-1",
"created_at": 1779678000,
"updated_at": 1779678000,
"updated_by": None,
"indexing_at": None,
"completed_at": None,
"error": None,
"stopped_at": None,
"child_chunks": [],
"attachments": [],
"summary": summary,
}
def _child_chunk() -> ChildChunk:
    """Build an in-memory ``ChildChunk`` with fixed ids and fresh timestamps.

    The instance is only constructed here; it is not added to any session.
    """
    constructor_kwargs = {
        "tenant_id": "tenant-1",
        "dataset_id": "dataset-1",
        "document_id": "doc-id",
        "segment_id": "seg-id",
        "position": 1,
        "content": "child chunk content",
        "word_count": 3,
        "created_by": "account-1",
    }
    chunk = ChildChunk(**constructor_kwargs)
    # These attributes are assigned after construction; presumably they are
    # normally populated by the database -- TODO confirm.
    chunk.id = "child-1"
    chunk.type = SegmentType.CUSTOMIZED
    chunk.created_at = naive_utc_now()
    chunk.updated_at = naive_utc_now()
    return chunk
class TestSegmentCreatePayload:
"""Test suite for SegmentCreatePayload Pydantic model."""
@@ -48,18 +100,34 @@ class TestSegmentCreatePayload:
{"content": "Second segment", "keywords": ["key1", "key2"]},
]
payload = SegmentCreatePayload(segments=segments)
assert payload.segments == segments
assert payload.segments is not None
assert [segment.model_dump(exclude_none=True) for segment in payload.segments] == segments
assert len(payload.segments) == 2
def test_payload_with_none_segments(self):
"""Test payload with None segments (should be valid)."""
payload = SegmentCreatePayload(segments=None)
assert payload.segments is None
"""Test payload with None segments is rejected."""
with pytest.raises(ValueError):
SegmentCreatePayload.model_validate({"segments": None})
def test_payload_with_empty_segments(self):
"""Test payload with empty segments list."""
payload = SegmentCreatePayload(segments=[])
assert payload.segments == []
"""Test payload with empty segments list is rejected."""
with pytest.raises(ValueError):
SegmentCreatePayload.model_validate({"segments": []})
def test_payload_requires_segments(self):
    """Validation fails when the ``segments`` field is missing entirely."""
    body_without_segments = {}
    with pytest.raises(ValueError):
        SegmentCreatePayload.model_validate(body_without_segments)
def test_payload_rejects_segment_without_content(self):
    """Validation fails when a segment item omits ``content``."""
    answer_only_item = {"answer": "Answer only"}
    with pytest.raises(ValueError):
        SegmentCreatePayload.model_validate({"segments": [answer_only_item]})
def test_payload_rejects_blank_content(self):
    """Validation fails when a segment item's ``content`` is only whitespace."""
    blank_item = {"content": " "}
    with pytest.raises(ValueError):
        SegmentCreateItemPayload.model_validate(blank_item)
def test_payload_with_complex_segment_data(self):
"""Test payload with complex segment structure."""
@@ -72,8 +140,9 @@ class TestSegmentCreatePayload:
}
]
payload = SegmentCreatePayload(segments=segments)
assert payload.segments[0]["content"] == "Complex segment"
assert payload.segments[0]["keywords"] == ["keyword1", "keyword2"]
assert payload.segments is not None
assert payload.segments[0].content == "Complex segment"
assert payload.segments[0].keywords == ["keyword1", "keyword2"]
class TestSegmentListQuery:
@@ -117,7 +186,7 @@ class TestChildChunkCreatePayload:
def test_payload_requires_content(self):
"""Test that content is required."""
with pytest.raises(ValueError):
ChildChunkCreatePayload()
ChildChunkCreatePayload.model_validate({})
def test_payload_with_long_content(self):
"""Test payload with very long content."""
@@ -157,12 +226,12 @@ class TestChildChunkListQuery:
def test_query_limit_minimum(self):
"""Test query limit minimum validation."""
with pytest.raises(ValueError):
ChildChunkListQuery(limit=0)
ChildChunkListQuery.model_validate({"limit": 0})
def test_query_page_minimum(self):
"""Test query page minimum validation."""
with pytest.raises(ValueError):
ChildChunkListQuery(page=0)
ChildChunkListQuery.model_validate({"page": 0})
def test_query_with_keyword(self):
"""Test query with keyword filter."""
@@ -292,6 +361,7 @@ class TestSegmentServiceMockedBehavior:
segments=[{"content": "Test"}, {"content": "Test 2"}], document=mock_document, dataset=mock_dataset
)
assert result is not None
assert len(result) == 2
mock_create.assert_called_once()
@@ -301,7 +371,12 @@ class TestSegmentServiceMockedBehavior:
mock_segments = [Mock(), Mock()]
mock_get.return_value = (mock_segments, 2)
segments, count = SegmentService.get_segments(document_id=mock_document.id, page=1, limit=20)
segments, count = SegmentService.get_segments(
document_id=mock_document.id,
tenant_id=mock_document.tenant_id,
page=1,
limit=20,
)
assert len(segments) == 2
assert count == 2
@@ -429,13 +504,13 @@ class TestDocumentValidation:
"""Test that enabled=True is valid."""
document = Mock(spec=Document)
document.enabled = True
assert document.enabled is True
assert document.enabled
def test_document_enabled_false_is_invalid(self):
"""Test that enabled=False is invalid for segment operations."""
document = Mock(spec=Document)
document.enabled = False
assert document.enabled is False
assert not document.enabled
class TestDatasetModels:
@@ -462,7 +537,7 @@ class TestDatasetModels:
assert segment.id is not None
assert segment.document_id is not None
assert segment.content is not None
assert segment.content == "Test content"
def test_child_chunk_has_required_fields(self):
"""Test ChildChunk model has required fields."""
@@ -473,7 +548,7 @@ class TestDatasetModels:
assert chunk.id is not None
assert chunk.segment_id is not None
assert chunk.content is not None
assert chunk.content == "Chunk content"
class TestSegmentUpdatePayload:
@@ -594,6 +669,7 @@ class TestSegmentCreateArgs:
from services.entities.knowledge_entities.knowledge_entities import SegmentCreateArgs
args = SegmentCreateArgs(content="Test content", keywords=["machine learning", "AI", "neural networks"])
assert args.keywords is not None
assert len(args.keywords) == 3
@@ -690,7 +766,7 @@ class TestSegmentIndexingRequirements:
# Both conditions must be true
assert document.indexing_status == "completed"
assert document.enabled is True
assert document.enabled
class TestSegmentLimits:
@@ -753,7 +829,7 @@ class TestSegmentPagination:
#
# Strategy per decorator type:
# - No billing decorator → call the method directly; only patch ``db``,
# services, ``current_account_with_tenant``, and ``marshal``.
# services, ``current_account_with_tenant``, and response helpers when needed.
# - ``@cloud_edition_billing_rate_limit_check`` (preserves ``__wrapped__``)
# → call via ``method.__wrapped__(self, …)`` to skip the decorator.
# - ``@cloud_edition_billing_resource_check`` (no ``__wrapped__``) → patch
@@ -766,11 +842,11 @@ class TestSegmentApiGet:
"""Test suite for SegmentApi.get() endpoint.
``get`` has no billing decorators but calls
``current_account_with_tenant()`` and ``marshal``.
``current_account_with_tenant()`` and response serialization.
"""
@patch("controllers.service_api.dataset.segment.SummaryIndexService")
@patch("controllers.service_api.dataset.segment.marshal")
@patch("controllers.service_api.dataset.segment.segment_responses_with_summaries")
@patch("controllers.service_api.dataset.segment.SummaryIndexService.get_segments_summaries")
@patch("controllers.service_api.dataset.segment.SegmentService")
@patch("controllers.service_api.dataset.segment.DocumentService")
@patch("controllers.service_api.dataset.segment.current_account_with_tenant")
@@ -781,8 +857,8 @@ class TestSegmentApiGet:
mock_account_fn,
mock_doc_svc,
mock_seg_svc,
mock_marshal,
mock_summary_svc,
mock_get_summaries,
mock_dump_segments,
app: Flask,
mock_tenant,
mock_dataset,
@@ -794,8 +870,8 @@ class TestSegmentApiGet:
mock_db.session.scalar.return_value = mock_dataset
mock_doc_svc.get_document.return_value = Mock(doc_form=IndexStructureType.PARAGRAPH_INDEX)
mock_seg_svc.get_segments.return_value = ([mock_segment], 1)
mock_marshal.return_value = {"id": mock_segment.id}
mock_summary_svc.get_segments_summaries.return_value = {}
mock_get_summaries.return_value = {}
mock_dump_segments.return_value = [_segment_response_dict()]
# Act
with app.test_request_context(
@@ -881,8 +957,8 @@ class TestSegmentApiPost:
mock_rate_limit.enabled = False
mock_feature_svc.get_knowledge_rate_limit.return_value = mock_rate_limit
@patch("controllers.service_api.dataset.segment.SummaryIndexService")
@patch("controllers.service_api.dataset.segment.marshal")
@patch("controllers.service_api.dataset.segment.segment_responses_with_summaries")
@patch("controllers.service_api.dataset.segment.SummaryIndexService.get_segments_summaries")
@patch("controllers.service_api.dataset.segment.SegmentService")
@patch("controllers.service_api.dataset.segment.DocumentService")
@patch("controllers.service_api.dataset.segment.current_account_with_tenant")
@@ -897,8 +973,8 @@ class TestSegmentApiPost:
mock_account_fn,
mock_doc_svc,
mock_seg_svc,
mock_marshal,
mock_summary_svc,
mock_get_summaries,
mock_dump_segments,
app: Flask,
mock_tenant,
mock_dataset,
@@ -920,8 +996,8 @@ class TestSegmentApiPost:
mock_seg_svc.segment_create_args_validate.return_value = None
mock_seg_svc.multi_create_segment.return_value = [mock_segment]
mock_marshal.return_value = {"id": mock_segment.id}
mock_summary_svc.get_segments_summaries.return_value = {}
mock_get_summaries.return_value = {}
mock_dump_segments.return_value = [_segment_response_dict()]
segments_data = [{"content": "Test segment content", "answer": "Test answer"}]
@@ -1222,8 +1298,8 @@ class TestDatasetSegmentApiUpdate:
mock_rate_limit.enabled = False
mock_feature_svc.get_knowledge_rate_limit.return_value = mock_rate_limit
@patch("controllers.service_api.dataset.segment.SummaryIndexService")
@patch("controllers.service_api.dataset.segment.marshal")
@patch("controllers.service_api.dataset.segment.segment_response_with_summary")
@patch("controllers.service_api.dataset.segment.SummaryIndexService.get_segment_summary")
@patch("controllers.service_api.dataset.segment.SegmentService")
@patch("controllers.service_api.dataset.segment.DocumentService")
@patch("controllers.service_api.dataset.segment.DatasetService")
@@ -1240,8 +1316,8 @@ class TestDatasetSegmentApiUpdate:
mock_dataset_svc,
mock_doc_svc,
mock_seg_svc,
mock_marshal,
mock_summary_svc,
mock_get_summary,
mock_dump_segment,
app: Flask,
mock_tenant,
mock_dataset,
@@ -1253,12 +1329,13 @@ class TestDatasetSegmentApiUpdate:
mock_dataset.indexing_technique = "economy"
mock_db.session.scalar.return_value = mock_dataset
mock_dataset_svc.check_dataset_model_setting.return_value = None
mock_doc_svc.get_document.return_value = Mock()
mock_doc_svc.get_document.return_value = Mock(doc_form=IndexStructureType.PARAGRAPH_INDEX)
mock_seg_svc.get_segment_by_id.return_value = mock_segment
updated = Mock()
updated.id = "updated-seg"
mock_seg_svc.update_segment.return_value = updated
mock_marshal.return_value = {"id": mock_segment.id}
mock_summary_svc.get_segment_summary.return_value = None
mock_get_summary.return_value = None
mock_dump_segment.return_value = _segment_response_dict()
with app.test_request_context(
f"/datasets/{mock_dataset.id}/documents/doc-id/segments/{mock_segment.id}",
@@ -1365,11 +1442,11 @@ class TestDatasetSegmentApiGetSingle:
"""Test suite for DatasetSegmentApi.get() (single segment) endpoint.
``get`` has no billing decorators but calls
``current_account_with_tenant()`` and ``marshal``.
``current_account_with_tenant()`` and response serialization.
"""
@patch("controllers.service_api.dataset.segment.SummaryIndexService")
@patch("controllers.service_api.dataset.segment.marshal")
@patch("controllers.service_api.dataset.segment.segment_response_with_summary")
@patch("controllers.service_api.dataset.segment.SummaryIndexService.get_segment_summary")
@patch("controllers.service_api.dataset.segment.SegmentService")
@patch("controllers.service_api.dataset.segment.DocumentService")
@patch("controllers.service_api.dataset.segment.DatasetService")
@@ -1382,8 +1459,8 @@ class TestDatasetSegmentApiGetSingle:
mock_dataset_svc,
mock_doc_svc,
mock_seg_svc,
mock_marshal,
mock_summary_svc,
mock_get_summary,
mock_dump_segment,
app: Flask,
mock_tenant,
mock_dataset,
@@ -1396,8 +1473,8 @@ class TestDatasetSegmentApiGetSingle:
mock_doc = Mock(doc_form=IndexStructureType.PARAGRAPH_INDEX)
mock_doc_svc.get_document.return_value = mock_doc
mock_seg_svc.get_segment_by_id.return_value = mock_segment
mock_marshal.return_value = {"id": mock_segment.id}
mock_summary_svc.get_segment_summary.return_value = None
mock_get_summary.return_value = None
mock_dump_segment.return_value = _segment_response_dict()
with app.test_request_context(
f"/datasets/{mock_dataset.id}/documents/doc-id/segments/{mock_segment.id}",
@@ -1415,8 +1492,8 @@ class TestDatasetSegmentApiGetSingle:
assert "data" in response
assert response["doc_form"] == IndexStructureType.PARAGRAPH_INDEX
@patch("controllers.service_api.dataset.segment.SummaryIndexService")
@patch("controllers.service_api.dataset.segment.marshal")
@patch("controllers.service_api.dataset.segment.segment_response_with_summary")
@patch("controllers.service_api.dataset.segment.SummaryIndexService.get_segment_summary")
@patch("controllers.service_api.dataset.segment.SegmentService")
@patch("controllers.service_api.dataset.segment.DocumentService")
@patch("controllers.service_api.dataset.segment.DatasetService")
@@ -1429,8 +1506,8 @@ class TestDatasetSegmentApiGetSingle:
mock_dataset_svc,
mock_doc_svc,
mock_seg_svc,
mock_marshal,
mock_summary_svc,
mock_get_summary,
mock_dump_segment,
app: Flask,
mock_tenant,
mock_dataset,
@@ -1443,11 +1520,9 @@ class TestDatasetSegmentApiGetSingle:
mock_doc = Mock(doc_form=IndexStructureType.PARAGRAPH_INDEX)
mock_doc_svc.get_document.return_value = mock_doc
mock_seg_svc.get_segment_by_id.return_value = mock_segment
mock_marshal.return_value = {"id": mock_segment.id, "summary": None}
mock_summary_record = Mock()
mock_summary_record.summary_content = "This is the segment summary"
mock_summary_svc.get_segment_summary.return_value = mock_summary_record
mock_summary_record = Mock(summary_content="This is the segment summary")
mock_get_summary.return_value = mock_summary_record
mock_dump_segment.return_value = _segment_response_dict("This is the segment summary")
with app.test_request_context(
f"/datasets/{mock_dataset.id}/documents/doc-id/segments/{mock_segment.id}",
@@ -1565,10 +1640,9 @@ class TestChildChunkApiGet:
"""Test suite for ChildChunkApi.get() endpoint.
``get`` has no billing decorators but calls
``current_account_with_tenant()``, ``marshal``, and ``db``.
``current_account_with_tenant()``, response serialization, and ``db``.
"""
@patch("controllers.service_api.dataset.segment.marshal")
@patch("controllers.service_api.dataset.segment.SegmentService")
@patch("controllers.service_api.dataset.segment.DocumentService")
@patch("controllers.service_api.dataset.segment.current_account_with_tenant")
@@ -1579,7 +1653,6 @@ class TestChildChunkApiGet:
mock_account_fn,
mock_doc_svc,
mock_seg_svc,
mock_marshal,
app: Flask,
mock_tenant,
mock_dataset,
@@ -1591,11 +1664,10 @@ class TestChildChunkApiGet:
mock_seg_svc.get_segment_by_id.return_value = Mock()
mock_pagination = Mock()
mock_pagination.items = [Mock(), Mock()]
mock_pagination.items = [_child_chunk(), _child_chunk()]
mock_pagination.total = 2
mock_pagination.pages = 1
mock_seg_svc.get_child_chunks.return_value = mock_pagination
mock_marshal.return_value = [{"id": "c1"}, {"id": "c2"}]
with app.test_request_context(
f"/datasets/{mock_dataset.id}/documents/doc-id/segments/seg-id/child_chunks?page=1&limit=20",
@@ -1727,7 +1799,6 @@ class TestChildChunkApiPost:
mock_rate_limit.enabled = False
mock_feature_svc.get_knowledge_rate_limit.return_value = mock_rate_limit
@patch("controllers.service_api.dataset.segment.marshal")
@patch("controllers.service_api.dataset.segment.SegmentService")
@patch("controllers.service_api.dataset.segment.DocumentService")
@patch("controllers.service_api.dataset.segment.current_account_with_tenant")
@@ -1742,7 +1813,6 @@ class TestChildChunkApiPost:
mock_account_fn,
mock_doc_svc,
mock_seg_svc,
mock_marshal,
app: Flask,
mock_tenant,
mock_dataset,
@@ -1754,9 +1824,8 @@ class TestChildChunkApiPost:
mock_db.session.scalar.return_value = mock_dataset
mock_doc_svc.get_document.return_value = Mock()
mock_seg_svc.get_segment_by_id.return_value = Mock()
mock_child = Mock()
mock_child = _child_chunk()
mock_seg_svc.create_child_chunk.return_value = mock_child
mock_marshal.return_value = {"id": "child-1"}
with app.test_request_context(
f"/datasets/{mock_dataset.id}/documents/doc-id/segments/seg-id/child_chunks",