[ADD] Third-party synchronization

1. Third-party website data access - Website synchronization
Build a knowledge base by crawling web page data in batches with a web crawler.
Website synchronization uses crawler technology to automatically capture all pages under the same domain name, starting from a single entry URL. Currently, it supports up to 200 subpages. For compliance and security reasons, only static site crawling is supported; it is mainly intended for quickly building knowledge bases from documentation sites.
2. Feishu Knowledge Base
By configuring Feishu document permissions, a knowledge base can be built from Feishu documents, and the documents are not stored a second time.
3. Yuque Knowledge Base
By configuring Yuque document permissions, a knowledge base can be built from Yuque documents, and the documents are not stored a second time.
This commit is contained in:
lixiangcheng1
2026-02-06 12:18:40 +08:00
parent c1941809e9
commit db46c186aa
30 changed files with 3422 additions and 1 deletions

View File

@@ -0,0 +1 @@
"""Integrations package for external services."""

View File

@@ -0,0 +1 @@
"""Feishu integration module for document synchronization."""

View File

@@ -0,0 +1,84 @@
"""Command-line interface for feishu integration."""
import asyncio
import sys
from app.core.rag.integrations.feishu.client import FeishuAPIClient
from app.core.rag.integrations.feishu.models import FileInfo
def main(feishu_app_id: str,  # Feishu application ID
         feishu_app_secret: str,  # Feishu application secret
         feishu_folder_token: str,  # Feishu folder token
         save_dir: str,  # directory the downloaded files are written to
         feishu_api_base_url: str = "https://open.feishu.cn/open-apis",  # Feishu API base URL
         timeout: int = 30,  # request timeout in seconds
         max_retries: int = 3,  # maximum number of retries
         recursive: bool = True  # whether to sync subfolders recursively
         ):
    """
    List the files of a Feishu folder and download every document into save_dir.

    Listing and downloading share one event loop and one client session, and
    both run inside the error handler: any failure is printed and the process
    exits with status 1.
    """
    api_client = FeishuAPIClient(
        app_id=feishu_app_id,
        app_secret=feishu_app_secret,
        api_base_url=feishu_api_base_url,
        timeout=timeout,
        max_retries=max_retries
    )

    async def sync_folder() -> None:
        """List the folder, then download each document, within a single client session."""
        async with api_client as client:
            if recursive:
                files = await client.list_all_folder_files(feishu_folder_token, recursive=True)
            else:
                # Walk the pages of the top-level folder only.
                files = []
                page_token = None
                while True:
                    files_page, page_token = await client.list_folder_files(
                        feishu_folder_token, page_token
                    )
                    files.extend(files_page)
                    if not page_token:
                        break
            # Folders are filtered out; "slides" entries are not part of this filter either.
            documents = [f for f in files if f.type in ["doc", "docx", "sheet", "bitable", "file"]]
            for doc in documents:
                print(f"\n{'=' * 80}")
                print(f"token: {doc.token}")
                print(f"name: {doc.name}")
                print(f"type: {doc.type}")
                print(f"created_time: {doc.created_time}")
                print(f"modified_time: {doc.modified_time}")
                print(f"owner_id: {doc.owner_id}")
                print(f"url: {doc.url}")
                print(f"{'=' * 80}\n")
                file_path = await client.download_document(document=doc, save_dir=save_dir)
                print(file_path)

    try:
        asyncio.run(sync_folder())
    except KeyboardInterrupt:
        print("\n\nfeishu integration interrupted by user.")
    except Exception as e:
        print(f"\n\nError during feishu integration: {e}")
        sys.exit(1)
if __name__ == '__main__':
    import os

    # Read credentials and the target directory from the environment so that
    # secrets and machine-specific paths need not be edited into the source.
    # The previous hard-coded values remain the defaults.
    feishu_app_id = os.environ.get("FEISHU_APP_ID", "")
    feishu_app_secret = os.environ.get("FEISHU_APP_SECRET", "")
    feishu_folder_token = os.environ.get("FEISHU_FOLDER_TOKEN", "")
    save_dir = os.environ.get(
        "FEISHU_SAVE_DIR",
        "/Volumes/MacintoshBD/Repository/RedBearAI/MemoryBear/api/files/",
    )
    main(feishu_app_id, feishu_app_secret, feishu_folder_token, save_dir)

View File

@@ -0,0 +1,452 @@
"""Feishu API client for document operations."""
import asyncio
import os
import re
from typing import Optional, Tuple, List
from datetime import datetime, timedelta
import httpx
from cachetools import TTLCache
import urllib.parse
from app.core.rag.integrations.feishu.exceptions import (
FeishuAuthError,
FeishuAPIError,
FeishuNotFoundError,
FeishuPermissionError,
FeishuRateLimitError,
FeishuNetworkError,
)
from app.core.rag.integrations.feishu.models import FileInfo
from app.core.rag.integrations.feishu.retry import with_retry
class FeishuAPIClient:
"""Feishu API client for document synchronization."""
def __init__(
    self,
    app_id: str,
    app_secret: str,
    api_base_url: str = "https://open.feishu.cn/open-apis",
    timeout: int = 30,
    max_retries: int = 3
):
    """
    Store the connection settings and prepare token caching.

    No network activity happens here; the HTTP client is created when the
    instance is entered as an async context manager.

    Args:
        app_id: Feishu application ID
        app_secret: Feishu application secret
        api_base_url: Feishu API base URL
        timeout: Request timeout in seconds
        max_retries: Maximum number of retries
    """
    # Credentials and endpoint
    self.app_id, self.app_secret = app_id, app_secret
    self.api_base_url = api_base_url
    # Request tuning
    self.timeout, self.max_retries = timeout, max_retries
    # Set by __aenter__
    self._http_client: Optional[httpx.AsyncClient] = None
    # Cached token lifetime: 2 hours minus a 5 minute margin
    token_ttl_seconds = 7200 - 300
    self._token_cache: TTLCache = TTLCache(maxsize=1, ttl=token_ttl_seconds)
    # Serializes token requests
    self._token_lock = asyncio.Lock()
async def __aenter__(self):
    """Create the HTTP client for this session and return the API client itself."""
    client_options = {
        "base_url": self.api_base_url,
        "timeout": self.timeout,
        "headers": {"Content-Type": "application/json"},
    }
    self._http_client = httpx.AsyncClient(**client_options)
    return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
if self._http_client:
await self._http_client.aclose()
async def get_tenant_access_token(self) -> str:
"""
Get tenant access token with caching.
Returns:
Access token string
Raises:
FeishuAuthError: If authentication fails
"""
# Check cache first
cached_token = self._token_cache.get("access_token")
if cached_token:
return cached_token
# Use lock to prevent concurrent token requests
async with self._token_lock:
# Double-check cache after acquiring lock
cached_token = self._token_cache.get("access_token")
if cached_token:
return cached_token
# Request new token
try:
if not self._http_client:
raise FeishuAuthError("HTTP client not initialized")
response = await self._http_client.post(
"/auth/v3/tenant_access_token/internal",
json={
"app_id": self.app_id,
"app_secret": self.app_secret
}
)
data = response.json()
if data.get("code") != 0:
error_msg = data.get("msg", "Unknown error")
raise FeishuAuthError(
f"Authentication failed: {error_msg}",
error_code=str(data.get("code")),
details=data
)
token = data.get("tenant_access_token")
if not token:
raise FeishuAuthError("No access token in response")
# Cache the token
self._token_cache["access_token"] = token
return token
except httpx.HTTPError as e:
raise FeishuAuthError(f"HTTP error during authentication: {str(e)}")
except Exception as e:
if isinstance(e, FeishuAuthError):
raise
raise FeishuAuthError(f"Unexpected error during authentication: {str(e)}")
@with_retry
async def list_folder_files(
    self,
    folder_token: str,
    page_token: Optional[str] = None
) -> Tuple[List[FileInfo], Optional[str]]:
    """
    Fetch one page of the entries contained in a folder.

    Args:
        folder_token: Folder token
        page_token: Page token for pagination
    Returns:
        Tuple of (list of FileInfo, next page token); the token is falsy
        when the response carries no further page
    Raises:
        FeishuAPIError: If the API call fails
        FeishuNotFoundError: If the folder is not found
        FeishuPermissionError: If permission is denied
    """
    try:
        token = await self.get_tenant_access_token()
        if not self._http_client:
            raise FeishuAPIError("HTTP client not initialized")
        params = {"page_size": 200, "folder_token": folder_token}
        if page_token:
            params["page_token"] = page_token
        response = await self._http_client.get(
            "/drive/v1/files",
            params=params,
            headers={"Authorization": f"Bearer {token}"}
        )
        data = response.json()
        # A non-zero "code" in the payload signals an API-level failure.
        error_code = data.get("code")
        if error_code != 0:
            error_msg = data.get("msg", "Unknown error")
            if error_code in (404, 230005):
                raise FeishuNotFoundError(
                    f"Folder not found: {error_msg}",
                    error_code=str(error_code),
                    details=data
                )
            if error_code in (403, 230003):
                raise FeishuPermissionError(
                    f"Permission denied: {error_msg}",
                    error_code=str(error_code),
                    details=data
                )
            raise FeishuAPIError(
                f"API error: {error_msg}",
                error_code=str(error_code),
                details=data
            )
        payload = data.get("data", {})
        files_data = payload.get("files", [])
        next_page_token = payload.get("next_page_token", None)
        files = []
        for file_data in files_data:
            try:
                files.append(
                    FileInfo(
                        token=file_data.get("token", ""),
                        name=file_data.get("name", ""),
                        type=file_data.get("type", ""),
                        created_time=datetime.fromtimestamp(int(file_data.get("created_time", 0))),
                        modified_time=datetime.fromtimestamp(int(file_data.get("modified_time", 0))),
                        owner_id=file_data.get("owner_id", ""),
                        url=file_data.get("url", "")
                    )
                )
            except (ValueError, TypeError):
                # Entries whose timestamps cannot be converted are skipped.
                continue
        return files, next_page_token
    except (FeishuAPIError, FeishuNotFoundError, FeishuPermissionError):
        # Typed errors raised above keep their type, code and details.
        raise
    except httpx.HTTPError as e:
        raise FeishuAPIError(f"HTTP error: {str(e)}") from e
    except Exception as e:
        raise FeishuAPIError(f"Unexpected error: {str(e)}") from e
async def list_all_folder_files(
self,
folder_token: str,
recursive: bool = True
) -> List[FileInfo]:
"""
Get all files in a folder, handling pagination automatically.
Args:
folder_token: Folder token
recursive: Whether to recursively get files from subfolders
Returns:
List of all FileInfo objects
Raises:
FeishuAPIError: If API call fails
"""
all_files = []
page_token = None
# Get all files with pagination
while True:
files, page_token = await self.list_folder_files(folder_token, page_token)
all_files.extend(files)
if not page_token:
break
# Recursively get files from subfolders if requested
if recursive:
subfolders = [f for f in all_files if f.type == "folder"]
for subfolder in subfolders:
try:
subfolder_files = await self.list_all_folder_files(
subfolder.token,
recursive=True
)
all_files.extend(subfolder_files)
except Exception:
# Continue with other folders if one fails
continue
return all_files
@with_retry
async def download_document(
    self,
    document: FileInfo,
    save_dir: str
) -> str:
    """
    Save a document into save_dir, choosing the transfer path by document type.

    "doc", "docx", "sheet" and "bitable" go through the export flow;
    "file" and "slides" are downloaded directly.

    Args:
        document: Document FileInfo
        save_dir: Directory the file is written to
    Returns:
        Full path of the written file
    Raises:
        FeishuAPIError: If the type is unsupported or the transfer fails
        FeishuNotFoundError: Passed through if raised underneath
        FeishuPermissionError: Passed through if raised underneath
    """
    exported_types = ("doc", "docx", "sheet", "bitable")
    direct_types = ("file", "slides")
    try:
        token = await self.get_tenant_access_token()
        if not self._http_client:
            raise FeishuAPIError("HTTP client not initialized")
        if document.type in exported_types:
            return await self._export_file(document, token, save_dir)
        if document.type in direct_types:
            return await self._download_file(document, token, save_dir)
        raise FeishuAPIError(f"Unsupported document type: {document.type}")
    except (FeishuAPIError, FeishuNotFoundError, FeishuPermissionError):
        raise
    except Exception as e:
        raise FeishuAPIError(f"Unexpected error: {str(e)}")
async def _export_file(self, document: FileInfo, access_token: str, save_dir: str) -> str:
"""export file for feishu online file type."""
try:
# 1.创建导出任务
file_extension = "pdf"
match document.type:
case "doc":
file_extension = "doc"
case "docx":
file_extension = "docx"
case "sheet":
file_extension = "xlsx"
case "bitable":
file_extension = "xlsx"
case _:
file_extension = "pdf"
response = await self._http_client.post(
"/drive/v1/export_tasks",
json={
"file_extension": file_extension,
"token": document.token,
"type": document.type
},
headers={"Authorization": f"Bearer {access_token}"}
)
data = response.json()
print(f"1.创建导出任务: {data}")
if data.get("code") != 0:
error_code = data.get("code")
error_msg = data.get("msg", "Unknown error")
raise FeishuAPIError(
f"API error: {error_msg}",
error_code=str(error_code),
details=data
)
ticket = data.get("data", {}).get("ticket", None)
if not ticket:
raise FeishuAuthError("No ticket in response")
# 2.轮序查询导出任务结果
max_retries = 10 # 最大轮询次数
poll_interval = 2 # 每次轮询间隔时间(秒)
file_token = None
for attempt in range(max_retries):
# 查询导出任务
response = await self._http_client.get(
f"/drive/v1/export_tasks/{ticket}",
params={"token": document.token},
headers={"Authorization": f"Bearer {access_token}"}
)
data = response.json()
print(f"2. 尝试查询导出任务结果 (第{attempt + 1}次): {data}")
if data.get("code") != 0:
error_code = data.get("code")
error_msg = data.get("msg", "Unknown error")
raise FeishuAPIError(
f"API error: {error_msg}",
error_code=str(error_code),
details=data,
)
# 检查导出任务结果
file_token = data.get("data", {}).get("result", {}).get("file_token", None)
if file_token:
# 如果导出任务成功生成 file_token则退出轮询
break
# 如果结果还没准备好,等待一段时间再进行下一次轮询
await asyncio.sleep(poll_interval)
if not file_token:
raise FeishuAPIError("Export task did not complete within the allowed time")
# 3.下载导出任务
response = await self._http_client.get(
f"/drive/v1/export_tasks/file/{file_token}/download",
headers={"Authorization": f"Bearer {access_token}"}
)
response.raise_for_status()
print(f'3.下载导出任务: {response.headers.get("Content-Disposition")}')
file_full_path = os.path.join(save_dir, document.name + "." + file_extension)
if os.path.exists(file_full_path):
os.remove(file_full_path) # Delete a single file
with open(file_full_path, "wb") as file:
file.write(response.content)
return file_full_path
except httpx.HTTPError as e:
raise FeishuAPIError(f"HTTP error: {str(e)}")
except Exception as e:
raise FeishuAPIError(f"Unexpected error during file download: {str(e)}")
async def _download_file(self, document: FileInfo, access_token: str, save_dir: str) -> str:
"""download file for file type."""
try:
response = await self._http_client.get(
f"/drive/v1/files/{document.token}/download",
headers={"Authorization": f"Bearer {access_token}"}
)
response.raise_for_status()
filename_header = response.headers.get("Content-Disposition")
# 最终的文件名(初始化为 None
filename = None
if filename_header:
# 优先解析 filename* 格式
match = re.search(r"filename\*=([^']*)''([^;]+)", filename_header)
if match:
# 使用 `filename*` 提取(已编码)
encoding = match.group(1) # 编码部分(如 UTF-8
encoded_filename = match.group(2) # 文件名部分
filename = urllib.parse.unquote(encoded_filename) # 解码 URL 编码的文件名
# 如果 `filename*` 不存在,回退到解析 `filename`
if not filename:
match = re.search(r'filename="([^"]+)"', filename_header)
if match:
filename = match.group(1)
# 如果文件名仍为 None则使用默认文件名
if not filename:
filename = f"{document.name}.pdf"
# 确保文件名合法,替换非法字符
filename = re.sub(r'[\/:*?"<>|]', '_', filename)
file_full_path = os.path.join(save_dir, filename)
if os.path.exists(file_full_path):
os.remove(file_full_path) # Delete a single file
with open(file_full_path, "wb") as file:
file.write(response.content)
return file_full_path
except httpx.HTTPError as e:
raise FeishuAPIError(f"HTTP error: {str(e)}")
except Exception as e:
raise FeishuAPIError(f"Unexpected error during file download: {str(e)}")

View File

@@ -0,0 +1,46 @@
"""Exception classes for Feishu integration."""
class FeishuError(Exception):
    """
    Root of the Feishu exception hierarchy.

    Attributes:
        message: Human-readable description (also the exception's argument)
        error_code: Error code as given by the raiser, or None
        details: Extra payload; always a dict, empty when none was given
    """

    def __init__(self, message: str, error_code: str = None, details: dict = None):
        super().__init__(message)
        self.message = message
        self.error_code = error_code
        self.details = details if details else {}


class FeishuAuthError(FeishuError):
    """Authentication with the Feishu API failed."""


class FeishuAPIError(FeishuError):
    """General error reported by the Feishu API."""


class FeishuNotFoundError(FeishuError):
    """The requested resource was not found (404)."""


class FeishuPermissionError(FeishuError):
    """Access to the resource was denied (403)."""


class FeishuRateLimitError(FeishuError):
    """The rate limit was exceeded (429)."""


class FeishuNetworkError(FeishuError):
    """Network-level failure (timeout, connection failure)."""


class FeishuDataError(FeishuError):
    """Data could not be parsed or validated."""

View File

@@ -0,0 +1,17 @@
"""Data models for Feishu integration."""
from dataclasses import dataclass
from datetime import datetime
from typing import Dict, Any, List, Optional
@dataclass
class FileInfo:
    """One entry (file, online document or folder) as listed by the Feishu drive API."""
    token: str  # identifier used in later API calls for this entry
    name: str  # display name
    type: str # doc/docx/sheet/bitable/file/slides/folder
    created_time: datetime  # creation time
    modified_time: datetime  # last modification time
    owner_id: str  # identifier of the owner
    url: str  # link to the entry

View File

@@ -0,0 +1,137 @@
"""Retry strategy for Feishu API calls."""
import asyncio
import functools
from typing import Callable, TypeVar
import httpx
from app.core.rag.integrations.feishu.exceptions import (
FeishuAuthError,
FeishuPermissionError,
FeishuNotFoundError,
FeishuRateLimitError,
FeishuNetworkError,
FeishuDataError,
FeishuAPIError,
)
T = TypeVar('T')
class RetryStrategy:
    """
    Retry policy for Feishu API calls: which errors are retried and how long to wait.

    An error wrapped in another exception (``raise X from err``, or raised
    while handling ``err``) is judged by the first exception in its chain that
    the policy can classify. This matters because callers may convert
    transport errors into a generic FeishuAPIError before the policy sees them.
    """

    # Errors that are worth another attempt
    RETRYABLE_ERRORS = (
        FeishuNetworkError,
        FeishuRateLimitError,
        httpx.TimeoutException,
        httpx.ConnectError,
        httpx.ReadError,
    )
    # Errors for which another attempt cannot help
    NON_RETRYABLE_ERRORS = (
        FeishuAuthError,
        FeishuPermissionError,
        FeishuNotFoundError,
        FeishuDataError,
    )
    # Number of retries after the first attempt, and the wait before each one
    MAX_RETRIES = 3
    BACKOFF_DELAYS = [1, 2, 4]  # seconds

    @classmethod
    def _classify(cls, error: BaseException):
        """Return True/False when `error` alone decides retryability, None otherwise."""
        if isinstance(error, cls.RETRYABLE_ERRORS):
            return True
        if isinstance(error, cls.NON_RETRYABLE_ERRORS):
            return False
        if isinstance(error, httpx.HTTPStatusError):
            status_code = error.response.status_code
            # 429 (rate limit) and every 5xx are retried; other 4xx are not.
            if status_code == 429 or 500 <= status_code < 600:
                return True
            if 400 <= status_code < 500:
                return False
        # API error codes treated as rate limiting
        if isinstance(error, FeishuAPIError) and error.error_code in ("99991400", "99991401"):
            return True
        return None

    @classmethod
    def is_retryable(cls, error: Exception) -> bool:
        """
        Check if an error is retryable.

        The error itself is classified first; if it is undecided, its cause
        (or implicit context, unless suppressed) is examined, and so on down
        the chain. An entirely undecided chain is not retried.
        """
        visited = set()
        current = error
        while current is not None and id(current) not in visited:
            visited.add(id(current))  # guards against a cyclic chain
            verdict = cls._classify(current)
            if verdict is not None:
                return verdict
            current = current.__cause__ or (
                None if current.__suppress_context__ else current.__context__
            )
        return False

    @classmethod
    async def execute_with_retry(
        cls,
        func: Callable[..., T],
        *args,
        **kwargs
    ) -> T:
        """
        Execute an async function, retrying it on retryable errors.

        Args:
            func: Async function to execute
            *args: Positional arguments for the function
            **kwargs: Keyword arguments for the function
        Returns:
            Function result
        Raises:
            Exception: The error of the failing attempt, when it is not
                retryable or the retries are used up
        """
        last_exception = None
        for attempt in range(cls.MAX_RETRIES + 1):
            try:
                return await func(*args, **kwargs)
            except Exception as e:
                last_exception = e
                # Give up on non-retryable errors and after the final attempt.
                if not cls.is_retryable(e) or attempt >= cls.MAX_RETRIES:
                    raise
                # Reuse the last delay if there are more retries than delays.
                delay = cls.BACKOFF_DELAYS[min(attempt, len(cls.BACKOFF_DELAYS) - 1)]
                await asyncio.sleep(delay)
        # Not reached in practice; kept as a safeguard.
        if last_exception:
            raise last_exception
def with_retry(func: Callable[..., T]) -> Callable[..., T]:
    """
    Wrap an async function so each call runs under RetryStrategy.

    Usage:
        @with_retry
        async def my_api_call():
            ...
    """
    async def retrying_call(*args, **kwargs):
        return await RetryStrategy.execute_with_retry(func, *args, **kwargs)

    # Carry over the wrapped function's name, docstring and other metadata.
    return functools.wraps(func)(retrying_call)

View File

@@ -0,0 +1 @@
"""Yuque integration module for document synchronization."""

View File

@@ -0,0 +1,77 @@
"""Main entry point for Yuque integration testing."""
import asyncio
import sys
from app.core.rag.integrations.yuque.client import YuqueAPIClient
from app.core.rag.integrations.yuque.models import YuqueDocInfo
def main(yuque_user_id: str,  # Yuque user ID
         yuque_token: str,  # Yuque token
         save_dir: str,  # directory the downloaded files are written to
         ):
    """
    List every document of every repository of a Yuque user and download each into save_dir.

    Listing and downloading share one event loop and one client session, and
    both run inside the error handler: any failure is printed and the process
    exits with status 1.
    """
    api_client = YuqueAPIClient(
        user_id=yuque_user_id,
        token=yuque_token
    )

    async def sync_repos() -> None:
        """Collect the documents of all repositories, then download each one."""
        async with api_client as client:
            print("\n=== Fetching repositories ===")
            repos = await client.get_user_repos()
            print(f"Found {len(repos)} repositories:")
            files = []
            for repo in repos:
                print(f"\n=== Fetching documents from '{repo.name}' ===")
                files.extend(await client.get_repo_docs(repo.id))
            for doc in files:
                print(f"\n{'=' * 80}")
                print(f"id: {doc.id}")
                print(f"type: {doc.type}")
                print(f"slug: {doc.slug}")
                print(f"title: {doc.title}")
                print(f"book_id: {doc.book_id}")
                print(f"public: {doc.public}")
                print(f"status: {doc.status}")
                print(f"created_at: {doc.created_at}")
                print(f"updated_at: {doc.updated_at}")
                print(f"published_at: {doc.published_at}")
                print(f"word_count: {doc.word_count}")
                print(f"cover: {doc.cover}")
                print(f"description: {doc.description}")
                print(f"{'=' * 80}\n")
                file_path = await client.download_document(doc, save_dir)
                print(file_path)

    try:
        asyncio.run(sync_repos())
    except KeyboardInterrupt:
        print("\n\nyuque integration interrupted by user.")
    except Exception as e:
        print(f"\n\nError during yuque integration: {e}")
        sys.exit(1)
if __name__ == "__main__":
    import os

    # Read credentials and the target directory from the environment so that
    # secrets and machine-specific paths need not be edited into the source.
    # The previous hard-coded values remain the defaults.
    yuque_user_id = os.environ.get("YUQUE_USER_ID", "")
    yuque_token = os.environ.get("YUQUE_TOKEN", "")
    save_dir = os.environ.get(
        "YUQUE_SAVE_DIR",
        "/Volumes/MacintoshBD/Repository/RedBearAI/MemoryBear/api/files/",
    )
    main(yuque_user_id, yuque_token, save_dir)

View File

@@ -0,0 +1,544 @@
"""Yuque API client for document operations."""
import os
import re
from typing import Optional, List
from datetime import datetime, timedelta
import httpx
import urllib.parse
import json
from openpyxl import Workbook
from openpyxl.styles import Font, Alignment, PatternFill
from openpyxl.utils import get_column_letter
import zlib
from app.core.rag.integrations.yuque.exceptions import (
YuqueAuthError,
YuqueAPIError,
YuqueNotFoundError,
YuquePermissionError,
YuqueRateLimitError,
YuqueNetworkError,
)
from app.core.rag.integrations.yuque.models import YuqueDocInfo, YuqueRepoInfo
from app.core.rag.integrations.yuque.retry import with_retry
class YuqueAPIClient:
"""Yuque API client for document synchronization."""
def __init__(
self,
user_id: str,
token: str,
api_base_url: str = "https://www.yuque.com/api/v2",
timeout: int = 30,
max_retries: int = 3
):
"""
Initialize Yuque API client.
Args:
user_id: Yuque user ID or login name
token: Yuque personal access token
api_base_url: Yuque API base URL
timeout: Request timeout in seconds
max_retries: Maximum number of retries
"""
self.user_id = user_id
self.token = token
self.api_base_url = api_base_url
self.timeout = timeout
self.max_retries = max_retries
self._http_client: Optional[httpx.AsyncClient] = None
async def __aenter__(self):
    """Create the HTTP client, with the auth token in its default headers, and return the API client."""
    default_headers = {
        "Content-Type": "application/json",
        "X-Auth-Token": self.token,
        "User-Agent": "Yuque-Integration-Client"
    }
    self._http_client = httpx.AsyncClient(
        base_url=self.api_base_url,
        timeout=self.timeout,
        headers=default_headers
    )
    return self
async def __aexit__(self, exc_type, exc_val, exc_tb):
"""Async context manager exit."""
if self._http_client:
await self._http_client.aclose()
def _handle_api_error(self, response: httpx.Response):
    """
    Raise the exception that corresponds to a failed response.

    The HTTP status selects the exception type (429, 404, 403, 401; anything
    else becomes YuqueAPIError). The message comes from the "message" field
    of the JSON body when the body can be parsed. This method always raises.
    """
    try:
        data = response.json()
    except Exception:
        # Unparseable body: fall back to an empty payload.
        data = {}
    status_code = response.status_code
    error_msg = data.get("message", "Unknown error")
    # status -> (exception type, message prefix)
    error_by_status = {
        429: (YuqueRateLimitError, "Rate limit exceeded"),
        404: (YuqueNotFoundError, "Resource not found"),
        403: (YuquePermissionError, "Permission denied"),
        401: (YuqueAuthError, "Authentication failed"),
    }
    error_type, prefix = error_by_status.get(status_code, (YuqueAPIError, "API error"))
    raise error_type(
        f"{prefix}: {error_msg}",
        error_code=str(status_code),
        details=data
    )
@with_retry
async def get_user_repos(self) -> List[YuqueRepoInfo]:
    """
    Get the repositories (knowledge bases) of the configured user.

    Entries that cannot be converted (for example a missing or malformed
    timestamp) are skipped.

    Returns:
        List of YuqueRepoInfo objects
    Raises:
        YuqueAPIError: If the client is not initialized or the call fails
        YuqueAuthError, YuqueNotFoundError, YuquePermissionError,
        YuqueRateLimitError: For the corresponding HTTP status of the response
    """
    # NOTE(review): a single request without paging parameters is sent; if the
    # API pages this listing, later pages are not fetched - confirm.
    try:
        if not self._http_client:
            raise YuqueAPIError("HTTP client not initialized")
        response = await self._http_client.get(f"/users/{self.user_id}/repos")
        if response.status_code != 200:
            self._handle_api_error(response)
        data = response.json()
        repos = []
        for repo_data in data.get("data", []):
            try:
                repos.append(
                    YuqueRepoInfo(
                        id=repo_data.get("id"),
                        type=repo_data.get("type", ""),
                        name=repo_data.get("name", ""),
                        namespace=repo_data.get("namespace", ""),
                        slug=repo_data.get("slug", ""),
                        description=repo_data.get("description"),
                        public=repo_data.get("public", 0),
                        items_count=repo_data.get("items_count", 0),
                        created_at=datetime.fromisoformat(repo_data.get("created_at", "").replace("Z", "+00:00")),
                        updated_at=datetime.fromisoformat(repo_data.get("updated_at", "").replace("Z", "+00:00"))
                    )
                )
            except (ValueError, TypeError, KeyError, AttributeError):
                # AttributeError covers a null timestamp or a non-dict entry.
                continue
        return repos
    except (YuqueAPIError, YuqueAuthError, YuqueNotFoundError,
            YuquePermissionError, YuqueRateLimitError):
        # Typed errors keep their type instead of being re-wrapped as generic.
        raise
    except httpx.HTTPError as e:
        raise YuqueAPIError(f"HTTP error: {str(e)}") from e
    except Exception as e:
        raise YuqueAPIError(f"Unexpected error: {str(e)}") from e
@with_retry
async def get_repo_docs(self, book_id: int) -> List[YuqueDocInfo]:
    """
    Get the documents of a repository, without their body content.

    Entries that cannot be converted (for example a missing or malformed
    timestamp) are skipped.

    Args:
        book_id: repository id
    Returns:
        List of YuqueDocInfo objects (body fields are None)
    Raises:
        YuqueAPIError: If the client is not initialized or the call fails
        YuqueAuthError, YuqueNotFoundError, YuquePermissionError,
        YuqueRateLimitError: For the corresponding HTTP status of the response
    """
    # NOTE(review): a single request without paging parameters is sent; if the
    # API pages this listing, later pages are not fetched - confirm.
    try:
        if not self._http_client:
            raise YuqueAPIError("HTTP client not initialized")
        response = await self._http_client.get(f"/repos/{book_id}/docs")
        if response.status_code != 200:
            self._handle_api_error(response)
        data = response.json()
        docs = []
        for doc_data in data.get("data", []):
            try:
                published_at = doc_data.get("published_at")
                docs.append(
                    YuqueDocInfo(
                        id=doc_data.get("id"),
                        type=doc_data.get("type", ""),
                        slug=doc_data.get("slug", ""),
                        title=doc_data.get("title", ""),
                        book_id=doc_data.get("book_id"),
                        format=doc_data.get("format", "markdown"),
                        body=None,  # the list response is not read for body content
                        body_draft=None,
                        body_html=None,
                        public=doc_data.get("public", 0),
                        status=doc_data.get("status", 0),
                        created_at=datetime.fromisoformat(doc_data.get("created_at", "").replace("Z", "+00:00")),
                        updated_at=datetime.fromisoformat(doc_data.get("updated_at", "").replace("Z", "+00:00")),
                        published_at=datetime.fromisoformat(published_at.replace("Z", "+00:00")) if published_at else None,
                        word_count=doc_data.get("word_count", 0),
                        cover=doc_data.get("cover"),
                        description=doc_data.get("description")
                    )
                )
            except (ValueError, TypeError, KeyError, AttributeError):
                # AttributeError covers a null timestamp or a non-dict entry.
                continue
        return docs
    except (YuqueAPIError, YuqueAuthError, YuqueNotFoundError,
            YuquePermissionError, YuqueRateLimitError):
        # Typed errors keep their type instead of being re-wrapped as generic.
        raise
    except httpx.HTTPError as e:
        raise YuqueAPIError(f"HTTP error: {str(e)}") from e
    except Exception as e:
        raise YuqueAPIError(f"Unexpected error: {str(e)}") from e
@with_retry
async def get_doc_detail(self, id: int) -> YuqueDocInfo:
    """
    Get a document including its content.

    Args:
        id: document ID
    Returns:
        YuqueDocInfo object with body, body_draft and body_html filled from the response
    Raises:
        YuqueAPIError: If the client is not initialized, the call fails, or
            the response cannot be converted
        YuqueAuthError, YuqueNotFoundError, YuquePermissionError,
        YuqueRateLimitError: For the corresponding HTTP status of the response
    """
    try:
        if not self._http_client:
            raise YuqueAPIError("HTTP client not initialized")
        response = await self._http_client.get(
            f"/repos/docs/{id}",
            params={"raw": 1}  # ask for the raw document content
        )
        if response.status_code != 200:
            self._handle_api_error(response)
        doc_data = response.json().get("data", {})
        published_at = doc_data.get("published_at")
        return YuqueDocInfo(
            id=doc_data.get("id"),
            type=doc_data.get("type", ""),
            slug=doc_data.get("slug", ""),
            title=doc_data.get("title", ""),
            book_id=doc_data.get("book_id"),
            format=doc_data.get("format", "markdown"),
            body=doc_data.get("body", ""),
            body_draft=doc_data.get("body_draft"),
            body_html=doc_data.get("body_html"),
            public=doc_data.get("public", 0),
            status=doc_data.get("status", 0),
            created_at=datetime.fromisoformat(doc_data.get("created_at", "").replace("Z", "+00:00")),
            updated_at=datetime.fromisoformat(doc_data.get("updated_at", "").replace("Z", "+00:00")),
            published_at=datetime.fromisoformat(published_at.replace("Z", "+00:00")) if published_at else None,
            word_count=doc_data.get("word_count", 0),
            cover=doc_data.get("cover"),
            description=doc_data.get("description")
        )
    except (YuqueAPIError, YuqueAuthError, YuqueNotFoundError,
            YuquePermissionError, YuqueRateLimitError):
        # Typed errors keep their type instead of being re-wrapped as generic.
        raise
    except httpx.HTTPError as e:
        raise YuqueAPIError(f"HTTP error: {str(e)}") from e
    except Exception as e:
        raise YuqueAPIError(f"Unexpected error: {str(e)}") from e
async def download_document(
self,
doc: YuqueDocInfo,
save_dir: str
) -> str:
"""
Download document content to local file.
Args:
doc: Document info (can be without body)
save_dir: Directory to save the file
Returns:
Full path to the saved file
Raises:
YuqueAPIError: If download fails
"""
try:
# Get full document content if not already loaded
if not doc.body:
doc = await self.get_doc_detail(doc.id)
# Sanitize filename
filename = re.sub(r'[\/:*?"<>|]', '_', doc.title)
# Determine file extension based on format
content = doc.body or ""
if doc.format == "markdown":
file_extension = "md"
elif doc.format == "lake":
file_extension = "md" # Save lake format as markdown
elif doc.format == "html":
file_extension = "html"
elif doc.format == "lakesheet":
file_extension = "xlsx"
body_data = json.loads(doc.body)
sheet_data = body_data.get("sheet", "")
try:
sheet_raw = zlib.decompress(bytes(sheet_data, 'latin-1'))
except Exception as e:
print(f"Error decompressing sheet data: {e}")
raise ValueError("Invalid or unsupported sheet data format.")
try:
sheet_text = sheet_raw.decode("utf-8") # 假设是 UTF-8 编码
except UnicodeDecodeError:
sheet_text = sheet_raw.decode("gbk") # 如果 UTF-8 解码失败,尝试 GBK
file_full_path = os.path.join(save_dir, f"{filename}.{file_extension}")
self.generate_excel_from_sheet(sheet_text, file_full_path)
return file_full_path
else:
file_extension = "txt"
file_full_path = os.path.join(save_dir, f"{filename}.{file_extension}")
# Remove existing file if it exists
if os.path.exists(file_full_path):
os.remove(file_full_path)
# Write content to file
with open(file_full_path, "w", encoding="utf-8") as file:
file.write(content)
return file_full_path
except Exception as e:
if isinstance(e, YuqueAPIError):
raise
raise YuqueAPIError(f"Unexpected error during file download: {str(e)}")
def generate_excel_from_sheet(self, sheet_text: str, save_path: str):
    """
    Convert decoded sheet data into an Excel file.

    Each element of the JSON array becomes one worksheet. Column widths, row
    heights, cell values/formulas, cell styles and merged ranges are taken
    from the element's ``columns``, ``rows``, ``data`` and ``mergeCells``
    keys. Cells and merge definitions that cannot be applied are reported and
    skipped so that one bad entry does not discard the whole workbook.

    Args:
        sheet_text (str): Sheet data as a JSON array of sheet objects.
        save_path (str): Path where the Excel file is saved.

    Raises:
        ValueError: If ``sheet_text`` is not valid JSON or not a JSON array.
        Exception: Any other failure while building or saving the workbook is
            reported and re-raised, so callers never assume a file exists
            when nothing was written.
    """
    max_rows = 1048576  # Excel's last row
    max_cols = 16384  # Excel's last column (XFD)
    # Characters openpyxl rejects in a worksheet title.
    invalid_title_chars = str.maketrans({ch: "_" for ch in "\\/*?:[]"})

    try:
        # Parse the JSON data
        sheets = json.loads(sheet_text)
        if not isinstance(sheets, list):
            raise ValueError("sheet_text must be a JSON array of sheets.")

        # Create a new Excel workbook
        workbook = Workbook()

        for sheet_index, sheet_data in enumerate(sheets):
            default_name = f"Sheet{sheet_index + 1}"
            raw_name = sheet_data.get("name", default_name) or default_name
            sheet_name = str(raw_name).translate(invalid_title_chars)
            # ``or`` guards against sections that are present but null.
            row_data = sheet_data.get("data", {}) or {}
            merge_cells = sheet_data.get("mergeCells", {}) or {}
            rows_styles = sheet_data.get("rows", []) or []
            cols_styles = sheet_data.get("columns", []) or []

            # Create the sheet; the workbook's initial sheet is reused first
            if sheet_index == 0:
                worksheet = workbook.active
                worksheet.title = sheet_name
            else:
                worksheet = workbook.create_sheet(title=sheet_name)

            # Column widths (source size is divided by 7.0; presumably a
            # pixel-to-character-width conversion -- TODO confirm)
            for col_index, col_style in enumerate(cols_styles):
                col_width = col_style.get("size", 82.125) / 7.0
                col_letter = get_column_letter(col_index + 1)  # Excel columns start at 1
                worksheet.column_dimensions[col_letter].width = col_width

            # Row heights (source size is divided by 1.5; presumably a
            # pixel-to-point conversion -- TODO confirm)
            for row_index, row_style in enumerate(rows_styles):
                row_height = row_style.get("size", 24) / 1.5
                worksheet.row_dimensions[row_index + 1].height = row_height

            # Write cell data
            for r_index, row in row_data.items():
                for c_index, cell in row.items():
                    # Defensive check: both indexes must be valid integers
                    try:
                        row_number = int(r_index) + 1
                        col_number = int(c_index) + 1
                    except ValueError:
                        print(f"Invalid row or column index: r_index={r_index}, c_index={c_index}")
                        continue
                    if col_number < 1 or col_number > max_cols:
                        print(f"Invalid column index: c_index={c_index}")
                        continue
                    if row_number < 1 or row_number > max_rows:
                        print(f"Invalid row index: r_index={r_index}")
                        continue
                    if not isinstance(cell, dict):
                        # Nothing to write for an empty/null cell entry.
                        continue

                    cell_obj = worksheet.cell(row=row_number, column=col_number)

                    # Values and formulas
                    cell_value = cell.get("value", "")
                    if isinstance(cell_value, dict):
                        if cell_value.get("class") == "formula" and "formula" in cell_value:
                            cell_obj.value = f"={cell_value['formula']}"  # formula
                        else:
                            cell_obj.value = cell_value.get("value", "")  # wrapped value
                    else:
                        cell_obj.value = cell_value  # plain value

                    # Apply style
                    style = cell.get("style", {})
                    self.apply_cell_style(cell_obj, style)

            # Merge cells
            for key, merge_def in merge_cells.items():
                try:
                    start_row = merge_def["row"] + 1
                    start_col = merge_def["col"] + 1
                    end_row = start_row + merge_def["rowCount"] - 1
                    end_col = start_col + merge_def["colCount"] - 1
                    worksheet.merge_cells(
                        start_row=start_row, start_column=start_col, end_row=end_row, end_column=end_col
                    )
                except (KeyError, TypeError, ValueError) as e:
                    print(f"Skipping invalid merge definition {key}: {e}")

        # Save the Excel file
        workbook.save(save_path)
        print(f"Excel file successfully saved to: {save_path}")
    except Exception as e:
        print(f"Error generating Excel file: {e}")
        # Propagate: swallowing here would let the caller return the path of
        # a file that was never written.
        raise
def apply_cell_style(self, cell, style):
    """
    Apply font, alignment and background fill from a style dict to a cell.

    Keys read from ``style``: ``fontSize``, ``fontWeight``, ``fontStyle``,
    ``underline``, ``color``, ``hAlign``, ``vAlign``, ``overflow`` and
    ``backColor``. Missing keys fall back to the defaults used below.

    Args:
        cell: openpyxl cell object to style.
        style: Style information as a dict (``None`` is treated as empty).
    """
    style = style or {}

    # Alignment values accepted here; anything else falls back to the default
    allowed_horizontal_alignments = {"general", "left", "center", "centerContinuous", "right", "fill", "justify",
                                     "distributed"}
    allowed_vertical_alignments = {"top", "center", "justify", "distributed", "bottom"}

    # fontWeight may arrive as a flag or as a CSS-style keyword. A plain
    # truthiness test would turn the string "normal" into bold, so keyword
    # strings that mean "not bold" are mapped to False explicitly.
    # NOTE(review): the exact value set the source uses is not visible here.
    font_weight = style.get("fontWeight", False)
    if isinstance(font_weight, str):
        is_bold = font_weight.strip().lower() not in {"", "normal", "lighter", "false", "f", "0"}
    else:
        is_bold = bool(font_weight)

    # Font
    font = Font(
        size=style.get("fontSize", 11),
        bold=is_bold,
        italic=style.get("fontStyle", "normal") == "italic",
        underline="single" if style.get("underline", False) else None,
        color=self.convert_color_to_hex(style.get("color", "#000000")),
    )
    cell.font = font

    # Alignment
    horizontal_alignment = style.get("hAlign", "left")
    vertical_alignment = style.get("vAlign", "top")
    # Use the defaults when the value is not an accepted alignment
    if horizontal_alignment not in allowed_horizontal_alignments:
        horizontal_alignment = "left"
    if vertical_alignment not in allowed_vertical_alignments:
        vertical_alignment = "top"
    alignment = Alignment(
        horizontal=horizontal_alignment,
        vertical=vertical_alignment,
        wrap_text=style.get("overflow") == "wrap",
    )
    cell.alignment = alignment

    # Background color: only set a fill when the color could be converted
    background_color = style.get("backColor", None)
    if background_color:
        hex_color = self.convert_color_to_hex(background_color)
        if hex_color:
            cell.fill = PatternFill(
                start_color=hex_color,
                end_color=hex_color,
                fill_type="solid"
            )
def convert_color_to_hex(self, color):
    """
    Convert a color string to the hex form expected by openpyxl.

    Accepted inputs:
        * ``#RGB``, ``#RRGGBB`` or ``#AARRGGBB`` -- returned upper-cased
          without the ``#``; the 3-digit shorthand is expanded to 6 digits.
        * ``rgb(r,g,b)`` -- returned as ``FFRRGGBB``.
        * ``rgba(r,g,b,a)`` with ``a`` in [0, 1] -- returned as ``AARRGGBB``.

    Channel values are clamped to 0-255 and alpha to 0-1 so the result always
    has a valid length.

    Args:
        color (str): Original color string, e.g. ``rgba(255,255,0,1.00)`` or
            ``#FFFFFF``.

    Returns:
        str: Converted color string such as ``FFFF0000``, or ``None`` when
        the input is empty, unrecognised or malformed.
    """
    try:
        if not color:
            return None
        text = color.strip()

        # Hex notation: validate instead of passing arbitrary text through,
        # because an invalid value would be rejected later by openpyxl.
        if text.startswith("#"):
            digits = text.lstrip("#").upper()
            if len(digits) == 3:
                digits = "".join(ch * 2 for ch in digits)
            if len(digits) not in (6, 8) or not set(digits) <= set("0123456789ABCDEF"):
                raise ValueError("expected 3, 6 or 8 hexadecimal digits")
            return digits

        # Functional notation, e.g. `rgb(255,255,0)` or `rgba(255,255,0,1.00)`
        lowered = text.lower()
        if lowered.startswith(("rgb(", "rgba(")):
            parts = lowered.partition("(")[2].rstrip(") ").split(",")
            if len(parts) not in (3, 4):
                raise ValueError("expected 3 or 4 components")
            red, green, blue = [min(255, max(0, int(v))) for v in parts[:3]]
            alpha = float(parts[3]) if len(parts) == 4 else 1.0
            alpha_hex = int(min(1.0, max(0.0, alpha)) * 255)  # map opacity to [00, FF]
            return f"{alpha_hex:02X}{red:02X}{green:02X}{blue:02X}"

        # Unknown notation (e.g. a named color)
        return None
    except Exception as e:
        print(f"Error parsing color '{color}': {e}")
        return None

View File

@@ -0,0 +1,46 @@
"""Exception classes for Yuque integration."""
class YuqueError(Exception):
    """Root of the Yuque exception hierarchy.

    Attributes:
        message: Human-readable description, also passed to ``Exception``.
        error_code: Optional error code supplied by the raiser.
        details: Extra context as a dict; an empty dict when none is given.
    """

    def __init__(self, message: str, error_code: str = None, details: dict = None):
        super().__init__(message)
        self.message = message
        self.error_code = error_code
        # A falsy ``details`` (None or empty) is replaced by a fresh dict.
        self.details = details if details else {}


class YuqueAuthError(YuqueError):
    """Authentication against the Yuque API failed."""


class YuqueAPIError(YuqueError):
    """Generic error reported by, or while calling, the Yuque API."""


class YuqueNotFoundError(YuqueError):
    """The requested resource does not exist (404)."""


class YuquePermissionError(YuqueError):
    """Access to the resource was denied (403)."""


class YuqueRateLimitError(YuqueError):
    """The rate limit was exceeded (429)."""


class YuqueNetworkError(YuqueError):
    """Network-level failure such as a timeout or connection failure."""


class YuqueDataError(YuqueError):
    """Data could not be parsed or failed validation."""

View File

@@ -0,0 +1,42 @@
"""Data models for Yuque integration."""
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
@dataclass
class YuqueRepoInfo:
    """Repository (knowledge base) information from Yuque."""

    # Knowledge base ID
    id: int
    # Type (Book: documents, Design: gallery, Sheet: spreadsheet, Resource: resources)
    type: str
    # Name
    name: str
    # Full path in user/repo format
    namespace: str
    # Path
    slug: str
    # Short introduction
    description: Optional[str]
    # Visibility (0: private, 1: public, 2: public within the enterprise)
    public: int
    # Number of documents
    items_count: int
    # Creation time
    created_at: datetime
    # Last update time
    updated_at: datetime
@dataclass
class YuqueDocInfo:
    """Document information from Yuque."""

    # Document ID
    id: int
    # Document type (Doc: regular document, Sheet: spreadsheet, Thread: topic,
    # Board: gallery, Table: data table)
    type: str
    # Path
    slug: str
    # Title
    title: str
    # ID of the knowledge base the document belongs to
    book_id: int
    # Content format (markdown: Markdown, lake: Yuque Lake format,
    # html: standard HTML, lakesheet: Yuque spreadsheet)
    format: str
    # Raw body content
    body: Optional[str]
    # Draft body content
    body_draft: Optional[str]
    # Body content in standard HTML format
    body_html: Optional[str]
    # Visibility (0: private, 1: public, 2: public within the enterprise)
    public: int
    # Status (0: draft, 1: published)
    status: int
    # Creation time
    created_at: datetime
    # Last update time
    updated_at: datetime
    # Publication time
    published_at: Optional[datetime]
    # Word count of the content
    word_count: int
    # Cover
    cover: Optional[str]
    # Summary
    description: Optional[str]

View File

@@ -0,0 +1,134 @@
"""Retry strategy for Yuque API calls."""
import asyncio
import functools
from typing import Callable, TypeVar
import httpx
from app.core.rag.integrations.yuque.exceptions import (
YuqueAuthError,
YuquePermissionError,
YuqueNotFoundError,
YuqueRateLimitError,
YuqueNetworkError,
YuqueDataError,
YuqueAPIError,
)
# Generic result type: ties the return annotation of the retry helpers below
# to the return annotation of the callable they wrap.
T = TypeVar('T')
class RetryStrategy:
    """Classifies failures as retryable or not and re-runs async calls with backoff."""

    # Error types that justify another attempt
    RETRYABLE_ERRORS = (
        YuqueNetworkError,
        YuqueRateLimitError,
        httpx.TimeoutException,
        httpx.ConnectError,
        httpx.ReadError,
    )

    # Error types that are never retried
    NON_RETRYABLE_ERRORS = (
        YuqueAuthError,
        YuquePermissionError,
        YuqueNotFoundError,
        YuqueDataError,
    )

    # Number of retries after the first attempt, and the wait before each one
    MAX_RETRIES = 3
    BACKOFF_DELAYS = [1, 2, 4]  # seconds

    @classmethod
    def is_retryable(cls, error: Exception) -> bool:
        """Return True when ``error`` is worth retrying.

        The retryable list is consulted first, then the non-retryable list.
        For ``httpx.HTTPStatusError`` the response status decides: 429 and
        every 5xx are retried, every other 4xx is not.
        """
        if isinstance(error, cls.RETRYABLE_ERRORS):
            return True
        if isinstance(error, cls.NON_RETRYABLE_ERRORS):
            return False

        if isinstance(error, httpx.HTTPStatusError):
            status = error.response.status_code
            # 429 (rate limit) and the whole 5xx range, which includes
            # 502 (bad gateway) and 503 (service unavailable)
            if status == 429 or 500 <= status < 600:
                return True
            # Remaining 4xx errors
            if 400 <= status < 500:
                return False

        # Rate-limit errors are retried even when not listed above
        return isinstance(error, YuqueRateLimitError)

    @classmethod
    async def execute_with_retry(
        cls,
        func: Callable[..., T],
        *args,
        **kwargs
    ) -> T:
        """
        Await ``func`` and retry it while the failure is retryable.

        At most ``MAX_RETRIES + 1`` attempts are made. Before each retry the
        coroutine sleeps for the matching entry of ``BACKOFF_DELAYS``; once
        the list is exhausted its last entry is reused.

        Args:
            func: Async function to execute
            *args: Positional arguments for the function
            **kwargs: Keyword arguments for the function

        Returns:
            Function result

        Raises:
            Exception: The error of the failing attempt, when it is not
                retryable or no attempts are left
        """
        last_exception = None

        for attempt in range(cls.MAX_RETRIES + 1):
            try:
                outcome = await func(*args, **kwargs)
            except Exception as exc:
                last_exception = exc
                # Give up on non-retryable errors and on the final attempt
                if not cls.is_retryable(exc) or attempt >= cls.MAX_RETRIES:
                    raise
                # Wait before retrying; clamp to the last configured delay
                slot = min(attempt, len(cls.BACKOFF_DELAYS) - 1)
                await asyncio.sleep(cls.BACKOFF_DELAYS[slot])
            else:
                return outcome

        # Should not reach here, but raise last exception if we do
        if last_exception:
            raise last_exception
def with_retry(func: Callable[..., T]) -> Callable[..., T]:
    """
    Wrap an async function so every call goes through the retry strategy.

    The returned coroutine function forwards its arguments to
    ``RetryStrategy.execute_with_retry`` and carries the metadata (name,
    docstring, ...) of the wrapped function.

    Usage:
        @with_retry
        async def my_api_call():
            ...
    """
    async def retrying_call(*args, **kwargs):
        return await RetryStrategy.execute_with_retry(func, *args, **kwargs)

    return functools.update_wrapper(retrying_call, func)