[ADD] Third-party synchronization
1. Third-party website data access (web site synchronization): build a knowledge base by crawling web pages in batches. Web site synchronization uses a crawler that starts from a single entry URL and automatically fetches the pages found under the same domain name. It currently supports up to 200 subpages. For compliance and security reasons only static sites are crawled; the feature is mainly intended for quickly building knowledge bases from documentation sites.
2. Feishu knowledge base: by configuring Feishu document permissions, a knowledge base can be built from Feishu documents; the documents are not stored a second time.
3. Yuque ("Language Bird") knowledge base: by configuring Yuque document permissions, a knowledge base can be built from Yuque documents; the documents are not stored a second time.
This commit is contained in:
0
api/app/core/rag/crawler/__init__.py
Normal file
0
api/app/core/rag/crawler/__init__.py
Normal file
89
api/app/core/rag/crawler/__main__.py
Normal file
89
api/app/core/rag/crawler/__main__.py
Normal file
@@ -0,0 +1,89 @@
|
||||
"""Command-line interface for web crawler."""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from app.core.rag.crawler.web_crawler import WebCrawler
|
||||
|
||||
|
||||
def setup_logging(verbose: bool = False):
    """Configure root logging to emit timestamped records on stdout.

    Args:
        verbose: When true the threshold is DEBUG, otherwise INFO.
    """
    if verbose:
        threshold = logging.DEBUG
    else:
        threshold = logging.INFO

    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=threshold,
        handlers=[stdout_handler],
    )
|
||||
|
||||
|
||||
def main(entry_url: str,
         max_pages: int = 200,
         delay_seconds: float = 1.0,
         timeout_seconds: int = 10,
         user_agent: str = "KnowledgeBaseCrawler/1.0"):
    """Crawl a site starting at ``entry_url`` and print the results.

    Each crawled document is printed as it arrives and also collected as a
    plain dict; once the crawl stops, the crawler's summary and the
    collected dicts are printed.

    Args:
        entry_url: URL handed to ``WebCrawler`` as the crawl entry point.
        max_pages: Forwarded to ``WebCrawler`` unchanged.
        delay_seconds: Forwarded to ``WebCrawler`` unchanged.
        timeout_seconds: Forwarded to ``WebCrawler`` unchanged.
        user_agent: Forwarded to ``WebCrawler`` unchanged.

    Raises:
        SystemExit: With status 1 when the crawl raises any ``Exception``.
            A ``KeyboardInterrupt`` only stops the crawl; the summary is
            still printed.
    """
    rule = '=' * 80

    crawler = WebCrawler(
        entry_url=entry_url,
        max_pages=max_pages,
        delay_seconds=delay_seconds,
        timeout_seconds=timeout_seconds,
        user_agent=user_agent,
    )

    documents = []
    try:
        for page in crawler.crawl():
            # Per-page report.
            print(f"\n{rule}")
            print(f"URL: {page.url}")
            print(f"Title: {page.title}")
            print(f"Content Length: {page.content_length} characters")
            print(f"Word Count: {page.metadata.get('word_count', 0)} words")
            print(f"{rule}\n")

            record = {
                'url': page.url,
                'title': page.title,
                'content': page.content,
                'content_length': page.content_length,
                'crawl_timestamp': page.crawl_timestamp.isoformat(),
                'http_status': page.http_status,
                'metadata': page.metadata,
            }
            documents.append(record)
    except KeyboardInterrupt:
        print("\n\nCrawl interrupted by user.")
    except Exception as exc:
        print(f"\n\nError during crawl: {exc}")
        sys.exit(1)

    # Final report.
    stats = crawler.get_summary()
    print(f"\n{rule}")
    print("CRAWL SUMMARY")
    print(f"{rule}")
    print(f"Total Pages Processed: {stats.total_pages_processed}")
    print(f"Total Errors: {stats.total_errors}")
    print(f"Total Skipped: {stats.total_skipped}")
    print(f"Total URLs Discovered: {stats.total_urls_discovered}")
    print(f"Duration: {stats.duration_seconds:.2f} seconds")
    print(f"documents: {documents}")

    if stats.error_breakdown:
        print("\nError Breakdown:")
        for error_type, count in stats.error_breakdown.items():
            print(f" {error_type}: {count}")
|
||||
|
||||
|
||||
def build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the crawler.

    Every argument is optional and defaults to the value that used to be
    hard-coded in the ``__main__`` guard, so running the module without
    arguments behaves exactly as before.

    Returns:
        argparse.ArgumentParser: Parser exposing ``entry_url``,
        ``max_pages``, ``delay_seconds``, ``timeout_seconds``,
        ``user_agent`` and ``verbose`` (a count of ``-v`` flags).
    """
    parser = argparse.ArgumentParser(
        description="Command-line interface for web crawler.",
    )
    parser.add_argument(
        "entry_url",
        nargs="?",
        default="https://www.xxx.com",  # placeholder kept from the original script
        help="URL to start crawling from",
    )
    parser.add_argument("--max-pages", type=int, default=20,
                        help="maximum number of pages to crawl")
    parser.add_argument("--delay-seconds", type=float, default=1.0,
                        help="delay between requests, in seconds")
    parser.add_argument("--timeout-seconds", type=int, default=10,
                        help="per-request timeout, in seconds")
    parser.add_argument("--user-agent", default="KnowledgeBaseCrawler/1.0",
                        help="User-Agent header sent by the crawler")
    parser.add_argument("-v", "--verbose", action="count", default=0,
                        help="-v enables INFO logging, -vv enables DEBUG logging")
    return parser


if __name__ == '__main__':
    args = build_arg_parser().parse_args()

    # Logging stays unconfigured unless asked for, matching the old behavior
    # of a plain run.
    if args.verbose:
        setup_logging(verbose=args.verbose > 1)

    main(
        args.entry_url,
        args.max_pages,
        args.delay_seconds,
        args.timeout_seconds,
        args.user_agent,
    )
|
||||
233
api/app/core/rag/crawler/content_extractor.py
Normal file
233
api/app/core/rag/crawler/content_extractor.py
Normal file
@@ -0,0 +1,233 @@
|
||||
"""Content extractor for web crawler."""
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import re
|
||||
import logging
|
||||
|
||||
from app.core.rag.crawler.models import ExtractedContent
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ContentExtractor:
    """Extract clean, readable text from HTML pages."""

    # Tags to remove completely
    REMOVE_TAGS = ['script', 'style', 'nav', 'header', 'footer', 'aside']

    # Tags that typically contain main content
    MAIN_CONTENT_TAGS = ['article', 'main']

    # Tags whose boundaries separate one extracted text block from the next
    CONTENT_TAGS = ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th', 'section']

    def is_static_content(self, html: str) -> bool:
        """
        Determine if the HTML represents static content.

        Heuristic: a page with more than 5 <script> tags and under 200
        characters of body text, or with under 50 characters of body text
        at all, is treated as JavaScript-rendered.

        Args:
            html: Raw HTML string

        Returns:
            bool: True if static, False if JavaScript-rendered or if the
            document has no <body>. True is also returned when the check
            itself raises.
        """
        try:
            soup = BeautifulSoup(html, 'lxml')

            # Count scripts before they are stripped from the body below.
            script_count = len(soup.find_all('script'))

            body = soup.find('body')
            if not body:
                return False

            # Scripts and styles must not count as visible text.
            for tag in body.find_all(['script', 'style']):
                tag.decompose()

            text_length = len(body.get_text(strip=True))

            # If there's very little text but many scripts, likely JS-rendered
            if script_count > 5 and text_length < 200:
                logger.warning("Detected JavaScript-rendered content (many scripts, little text)")
                return False

            # If there's no meaningful text, likely JS-rendered
            if text_length < 50:
                logger.warning("Detected JavaScript-rendered content (minimal text)")
                return False

            return True

        except Exception as e:
            logger.error(f"Error checking if content is static: {e}")
            return True  # Assume static on error

    def extract(self, html: str, url: str) -> ExtractedContent:
        """
        Extract clean text content from HTML.

        Args:
            html: Raw HTML string
            url: Source URL (used for logging and stored in the metadata)

        Returns:
            ExtractedContent: title, normalized text, static flag and word
            count. On any failure an empty result is returned whose title
            is the URL and whose metadata carries the error message.
        """
        try:
            soup = BeautifulSoup(html, 'lxml')

            # Runs on its own parse of the HTML, so it is unaffected by the
            # tag removal below.
            is_static = self.is_static_content(html)

            # The title is read before <header> elements are removed.
            title = self._extract_title(soup)

            for tag_name in self.REMOVE_TAGS:
                for tag in soup.find_all(tag_name):
                    tag.decompose()

            text = self._normalize_whitespace(self._extract_main_content(soup))
            word_count = len(text.split())

            logger.info(f"Extracted {word_count} words from {url}")

            return ExtractedContent(
                title=title,
                text=text,
                is_static=is_static,
                word_count=word_count,
                metadata={'url': url}
            )

        except Exception as e:
            logger.error(f"Error extracting content from {url}: {e}")
            return ExtractedContent(
                title=url,
                text="",
                is_static=False,
                word_count=0,
                metadata={'url': url, 'error': str(e)}
            )

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """
        Extract title from HTML.

        Tries <title> tag first, then first <h1>.

        Args:
            soup: BeautifulSoup object

        Returns:
            str: Page title, or an empty string when neither is usable
        """
        title_tag = soup.find('title')
        if title_tag and title_tag.string:
            return title_tag.string.strip()

        h1_tag = soup.find('h1')
        if h1_tag:
            return h1_tag.get_text(strip=True)

        return ""

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """
        Extract main content from HTML.

        The content root is chosen in this order: <article>/<main>,
        div[role="main"], a div/section whose class or id matches
        content/main/article/post, and finally <body>. The text under that
        root is returned as newline-separated blocks.

        Args:
            soup: BeautifulSoup object

        Returns:
            str: Extracted text content ("" when no root exists)
        """
        main_content = None

        # Priority 1: <article> or <main> tags
        for tag_name in self.MAIN_CONTENT_TAGS:
            main_content = soup.find(tag_name)
            if main_content:
                logger.debug(f"Found main content in <{tag_name}> tag")
                break

        # Priority 2: div with role="main"
        if not main_content:
            main_content = soup.find('div', role='main')
            if main_content:
                logger.debug("Found main content in div[role='main']")

        # Priority 3: Common class/id patterns
        if not main_content:
            for pattern in ['content', 'main', 'article', 'post']:
                main_content = soup.find(['div', 'section'], class_=re.compile(pattern, re.I))
                if main_content:
                    logger.debug(f"Found main content with class pattern '{pattern}'")
                    break

                main_content = soup.find(['div', 'section'], id=re.compile(pattern, re.I))
                if main_content:
                    logger.debug(f"Found main content with id pattern '{pattern}'")
                    break

        # Fallback: use body
        if not main_content:
            main_content = soup.find('body')
            logger.debug("Using <body> as main content (no specific content area found)")

        if main_content:
            return '\n'.join(self._collect_text_blocks(main_content))

        return ""

    def _collect_text_blocks(self, root) -> list:
        """
        Split the text under ``root`` into blocks, emitting each text node once.

        Text nodes are visited in document order and grouped by their
        nearest ancestor listed in CONTENT_TAGS. Calling get_text() on every
        content tag instead would repeat the text of nested tags once per
        nesting level and would fuse words across inline tags.

        Args:
            root: Element whose descendants are scanned

        Returns:
            list: Non-empty text blocks in document order
        """
        # Local import: the module itself only imports BeautifulSoup from bs4.
        from bs4.element import CData, NavigableString

        blocks = []
        pending = []
        pending_owner = None

        def flush():
            text = ''.join(pending).strip()
            if text:
                blocks.append(text)
            pending.clear()

        for node in root.find_all(string=True):
            # Same string types get_text() includes by default; this skips
            # comments, doctypes and similar non-content nodes.
            if type(node) not in (NavigableString, CData):
                continue

            owner = node.find_parent(self.CONTENT_TAGS)
            # Identity, not equality: bs4 compares tags structurally, and two
            # distinct but identical paragraphs must stay separate blocks.
            if pending and owner is not pending_owner:
                flush()
            pending_owner = owner
            # Raw text is kept so spacing around inline tags survives.
            pending.append(str(node))

        flush()
        return blocks

    def _normalize_whitespace(self, text: str) -> str:
        """
        Normalize whitespace in text.

        - Collapse runs of spaces to a single space
        - Reduce runs of three or more newlines to two
        - Strip leading/trailing whitespace

        Args:
            text: Text to normalize

        Returns:
            str: Normalized text
        """
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()
|
||||
302
api/app/core/rag/crawler/http_fetcher.py
Normal file
302
api/app/core/rag/crawler/http_fetcher.py
Normal file
@@ -0,0 +1,302 @@
|
||||
"""HTTP fetcher for web crawler."""
|
||||
|
||||
import requests
|
||||
import time
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional, Dict
|
||||
|
||||
|
||||
from app.core.rag.crawler.models import FetchResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HTTPFetcher:
    """Handle HTTP requests with retries, error handling, and response validation."""

    # Finds a charset declaration inside a <meta> tag. Because it only needs
    # "charset=" somewhere in the tag, it matches <meta charset="..."> and
    # also the content attribute of an http-equiv Content-Type meta.
    _META_CHARSET_RE = re.compile(
        r'<meta[^>]+charset=["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE
    )
    _META_CONTENT_TYPE_RE = re.compile(
        r'<meta[^>]+http-equiv=["\']?content-type["\']?[^>]+content=["\']([^"\']+)',
        re.IGNORECASE,
    )
    _CHARSET_PARAM_RE = re.compile(r'charset=([a-zA-Z0-9_-]+)', re.IGNORECASE)

    # Encodings tried, in order, once meta/header/UTF-8 decoding has failed.
    # NOTE(review): iso-8859-1 accepts every byte sequence, so the entries
    # after it and the final latin-1 fallback are effectively unreachable.
    _FALLBACK_ENCODINGS = (
        'gbk',           # Chinese (Simplified)
        'gb2312',        # Chinese (Simplified, older)
        'gb18030',       # Chinese (Simplified, extended)
        'big5',          # Chinese (Traditional)
        'shift_jis',     # Japanese
        'euc-jp',        # Japanese
        'euc-kr',        # Korean
        'iso-8859-1',    # Western European
        'windows-1252',  # Windows Western European
        'windows-1251',  # Cyrillic
    )

    def __init__(
        self,
        timeout: int = 10,
        max_retries: int = 3,
        user_agent: str = "KnowledgeBaseCrawler/1.0"
    ):
        """
        Initialize HTTP fetcher.

        Args:
            timeout: Request timeout in seconds
            max_retries: Maximum number of attempts per URL
            user_agent: User-Agent header value
        """
        self.timeout = timeout
        self.max_retries = max_retries
        self.user_agent = user_agent

        # One session for all requests so connections are reused.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': user_agent})

    def close(self) -> None:
        """Close the underlying session and its pooled connections."""
        self.session.close()

    @staticmethod
    def _failure(url: str, error: Optional[str], status_code: int = 0, response=None) -> FetchResult:
        """Build an unsuccessful FetchResult.

        When ``response`` is given, its final URL and headers are reported;
        otherwise the requested URL and empty headers are used.
        """
        # requests.Response is falsy for error statuses, so compare with None.
        has_response = response is not None
        return FetchResult(
            url=url,
            final_url=response.url if has_response else url,
            status_code=status_code,
            content=None,
            headers=dict(response.headers) if has_response else {},
            error=error,
            success=False
        )

    def fetch(self, url: str) -> FetchResult:
        """
        Fetch a URL with retry logic and error handling.

        Retried (with 1s, 2s, 4s... backoff before each retry): 429, 5xx,
        timeouts, connection errors and other request errors. A 503 adds an
        extra 5 second pause. Not retried: 2xx, other 4xx, SSL errors, and
        any status outside the 2xx/4xx/5xx ranges.

        Args:
            url: URL to fetch

        Returns:
            FetchResult: Contains status_code, content, headers, error info.
            status_code is 0 when no HTTP response was obtained.
        """
        last_error = None

        for attempt in range(self.max_retries):
            has_retry_left = attempt < self.max_retries - 1
            try:
                if attempt > 0:
                    backoff_delay = 2 ** (attempt - 1)  # 1s, 2s, 4s
                    logger.info(f"Retry attempt {attempt + 1}/{self.max_retries} for {url} after {backoff_delay}s")
                    time.sleep(backoff_delay)

                response = self.session.get(
                    url,
                    timeout=self.timeout,
                    allow_redirects=True
                )
                status = response.status_code

                if status == 429:
                    # Too Many Requests: retry; on the last attempt it is
                    # reported by the generic 4xx branch below.
                    logger.warning(f"429 Too Many Requests for {url}, backing off")
                    if has_retry_left:
                        continue

                if status == 503:
                    # Service Unavailable: longer pause, then retry; on the
                    # last attempt it is reported by the 5xx branch below.
                    logger.warning(f"503 Service Unavailable for {url}")
                    if has_retry_left:
                        time.sleep(5)
                        continue

                if 200 <= status < 300:
                    logger.info(f"Successfully fetched {url} (status: {status})")
                    return FetchResult(
                        url=url,
                        final_url=response.url,
                        status_code=status,
                        content=self._get_decoded_content(response),
                        headers=dict(response.headers),
                        error=None,
                        success=True
                    )

                if status == 404:
                    logger.info(f"404 Not Found: {url}")
                    return self._failure(url, "Not Found", status, response)

                if 400 <= status < 500:
                    logger.warning(f"Client error {status} for {url}")
                    return self._failure(url, f"Client error: {status}", status, response)

                if 500 <= status < 600:
                    logger.error(f"Server error {status} for {url}")
                    last_error = f"Server error: {status}"
                    if has_retry_left:
                        continue
                    return self._failure(url, last_error, status)

                # Anything else (1xx, or a 3xx that was not followed) used to
                # fall through, get retried silently and end as "Unknown
                # error" with status 0. Report it with its real status.
                logger.warning(f"Unexpected status {status} for {url}")
                return self._failure(url, f"Unexpected status: {status}", status, response)

            except requests.exceptions.Timeout:
                last_error = "Request timeout"
                logger.warning(f"Timeout fetching {url} (attempt {attempt + 1}/{self.max_retries})")

            except requests.exceptions.SSLError as e:
                # Listed before ConnectionError (its base class); not retried.
                last_error = f"SSL/TLS error: {str(e)}"
                logger.error(f"SSL/TLS error for {url}: {e}")
                return self._failure(url, last_error)

            except requests.exceptions.ConnectionError as e:
                last_error = f"Connection error: {str(e)}"
                logger.warning(f"Connection error for {url} (attempt {attempt + 1}/{self.max_retries}): {e}")

            except requests.exceptions.RequestException as e:
                last_error = f"Request error: {str(e)}"
                logger.error(f"Request error for {url}: {e}")

        # All retries exhausted
        logger.error(f"Failed to fetch {url} after {self.max_retries} attempts: {last_error}")
        return self._failure(url, last_error or "Unknown error")

    def _get_decoded_content(self, response) -> str:
        """
        Get correctly decoded content from response.

        Decoding strategies, in order:
        1. Encoding declared in an HTML meta tag
        2. response.encoding, unless it is ISO-8859-1
        3. UTF-8
        4. The encodings in _FALLBACK_ENCODINGS
        5. latin-1 with error replacement

        Args:
            response: requests.Response object

        Returns:
            str: Decoded content
        """
        raw = response.content

        meta_encoding = self._detect_encoding_from_meta(raw)
        if meta_encoding:
            try:
                content = raw.decode(meta_encoding)
                logger.info(f"Successfully decoded with meta tag encoding: {meta_encoding}")
                return content
            except (UnicodeDecodeError, LookupError) as e:
                logger.warning(f"Failed to decode with meta encoding {meta_encoding}: {e}")

        # requests defaults to ISO-8859-1 when Content-Type has no charset,
        # so that value is skipped here and UTF-8 is tried first.
        if response.encoding and response.encoding.lower() != 'iso-8859-1':
            try:
                return response.text
            except (UnicodeDecodeError, LookupError) as e:
                logger.warning(f"Failed to decode with detected encoding {response.encoding}: {e}")

        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            logger.debug("UTF-8 decoding failed, trying other encodings")

        for encoding in self._FALLBACK_ENCODINGS:
            try:
                content = raw.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                continue
            logger.info(f"Successfully decoded with {encoding}")
            return content

        # Last resort: use latin-1 with error replacement
        logger.warning("All encoding attempts failed, using latin-1 with error replacement")
        return raw.decode('latin-1', errors='replace')

    def _detect_encoding_from_meta(self, content: bytes) -> Optional[str]:
        """
        Detect encoding from HTML meta tags.

        Looks for:
        - <meta charset="...">
        - <meta http-equiv="Content-Type" content="...; charset=...">

        Args:
            content: Raw response content (bytes)

        Returns:
            Optional[str]: Lower-cased encoding name, or None
        """
        try:
            # Only the first 2KB is scanned. Non-ASCII bytes are dropped;
            # decoding with errors='ignore' cannot fail.
            head_str = content[:2048].decode('ascii', errors='ignore')

            charset_match = self._META_CHARSET_RE.search(head_str)
            if charset_match:
                encoding = charset_match.group(1).lower()
                logger.debug(f"Found charset in meta tag: {encoding}")
                return encoding

            content_type_match = self._META_CONTENT_TYPE_RE.search(head_str)
            if content_type_match:
                charset_match = self._CHARSET_PARAM_RE.search(content_type_match.group(1))
                if charset_match:
                    encoding = charset_match.group(1).lower()
                    logger.debug(f"Found charset in Content-Type meta: {encoding}")
                    return encoding

        except Exception as e:
            logger.debug(f"Error detecting encoding from meta tags: {e}")

        return None
|
||||
52
api/app/core/rag/crawler/models.py
Normal file
52
api/app/core/rag/crawler/models.py
Normal file
@@ -0,0 +1,52 @@
|
||||
"""Data models for web crawler."""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
|
||||
@dataclass
class CrawledDocument:
    """One crawled web page together with the text extracted from it.

    Field meanings are inferred from names and types only; the code that
    fills them in is not part of this module.
    """

    url: str                     # page address
    title: str                   # page title
    content: str                 # extracted text
    content_length: int          # presumably len(content) -- confirm against the producer
    crawl_timestamp: datetime    # when the page was crawled
    http_status: int             # HTTP status code of the response
    # Free-form extras; default_factory gives every instance its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class FetchResult:
    """Outcome of a single HTTP fetch, successful or not.

    All fields are required; there are no defaults.
    """

    url: str                  # URL that was requested
    final_url: str            # URL after redirects (or the requested URL)
    status_code: int          # HTTP status code
    content: Optional[str]    # decoded body, or None
    headers: Dict[str, str]   # response headers
    error: Optional[str]      # error description, or None
    success: bool             # whether the fetch succeeded
|
||||
|
||||
|
||||
@dataclass
class ExtractedContent:
    """Text and bookkeeping produced by extracting content from one HTML page."""

    title: str        # page title
    text: str         # extracted text
    is_static: bool   # False when the page was judged JavaScript-rendered
    word_count: int   # number of words in the extracted text
    # Free-form extras; default_factory gives every instance its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class CrawlSummary:
    """Aggregate statistics describing one finished crawl.

    Field meanings are inferred from names and types only; the code that
    computes them is not part of this module.
    """

    total_pages_processed: int   # pages processed
    total_errors: int            # errors encountered
    total_skipped: int           # URLs skipped
    total_urls_discovered: int   # URLs discovered
    start_time: datetime         # crawl start
    end_time: datetime           # crawl end
    duration_seconds: float      # presumably end_time - start_time in seconds -- confirm
    # Count per error type; default_factory gives every instance its own dict.
    error_breakdown: Dict[str, int] = field(default_factory=dict)
|
||||
57
api/app/core/rag/crawler/rate_limiter.py
Normal file
57
api/app/core/rag/crawler/rate_limiter.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Rate limiter for web crawler."""
|
||||
|
||||
import time
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RateLimiter:
    """Enforce delays between requests to be polite to servers."""

    def __init__(self, delay_seconds: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            delay_seconds: Minimum delay between requests
        """
        self.delay_seconds = delay_seconds
        # Reading of time.monotonic() taken at the end of the last wait().
        # It is not a wall-clock timestamp. 0.0 means no request yet.
        self.last_request_time = 0.0
        self.max_delay = 60.0  # Cap applied by set_delay() and backoff()

    def wait(self):
        """
        Block until enough time has passed since last request.

        Sleeps for whatever remains of ``delay_seconds`` since the previous
        call returned, then records the new reference time.
        """
        # A monotonic clock never goes backwards. With wall-clock time a
        # backwards clock adjustment makes the elapsed time negative and the
        # sleep as long as the adjustment.
        elapsed = time.monotonic() - self.last_request_time

        if elapsed < self.delay_seconds:
            sleep_time = self.delay_seconds - elapsed
            logger.debug(f"Rate limiting: sleeping for {sleep_time:.2f} seconds")
            time.sleep(sleep_time)

        self.last_request_time = time.monotonic()

    def set_delay(self, delay_seconds: float):
        """
        Update the delay (useful for respecting Crawl-delay from robots.txt).

        Args:
            delay_seconds: New delay in seconds; values above ``max_delay``
                are reduced to ``max_delay``
        """
        self.delay_seconds = min(delay_seconds, self.max_delay)
        logger.info(f"Rate limiter delay updated to {self.delay_seconds} seconds")

    def backoff(self, multiplier: float = 2.0):
        """
        Increase delay exponentially for backoff scenarios (429, 503 responses).

        The result is capped at ``max_delay``. A current delay of 0 stays 0,
        because the new delay is a multiple of the current one.

        Args:
            multiplier: Factor to multiply current delay by
        """
        old_delay = self.delay_seconds
        self.delay_seconds = min(self.delay_seconds * multiplier, self.max_delay)
        logger.warning(f"Rate limiter backing off: {old_delay:.2f}s -> {self.delay_seconds:.2f}s")
|
||||
118
api/app/core/rag/crawler/robots_parser.py
Normal file
118
api/app/core/rag/crawler/robots_parser.py
Normal file
@@ -0,0 +1,118 @@
|
||||
"""Robots.txt parser for web crawler."""
|
||||
|
||||
from urllib.robotparser import RobotFileParser
|
||||
from urllib.parse import urlparse, urljoin
|
||||
from typing import Optional
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RobotsParser:
    """Parse and check robots.txt compliance for URLs."""

    def __init__(self, user_agent: str, timeout: int = 10):
        """
        Initialize robots.txt parser.

        Args:
            user_agent: User agent string to check permissions for
            timeout: Timeout in seconds for fetching robots.txt
        """
        self.user_agent = user_agent
        self.timeout = timeout
        self._parsers = {}  # Cache of parsers keyed by robots.txt URL

    def _get_robots_url(self, url: str) -> str:
        """
        Get the robots.txt URL for a given URL.

        Args:
            url: URL to get robots.txt for

        Returns:
            str: ``<scheme>://<netloc>/robots.txt`` built from ``url``
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}/robots.txt"

    def _read_with_timeout(self, parser: RobotFileParser, robots_url: str) -> bool:
        """
        Download robots.txt into ``parser``, honoring ``self.timeout``.

        RobotFileParser.read() calls urlopen() without a timeout and can
        block indefinitely, so the download is done here. HTTP error
        statuses are handled as read() does: 401/403 disallow everything,
        other 4xx allow everything, and any other error status leaves the
        parser unloaded (it then denies every URL).

        Args:
            parser: Parser to populate
            robots_url: Location of the robots.txt file

        Returns:
            bool: True when a robots.txt body was downloaded and parsed,
            False when the server answered with an HTTP error status

        Raises:
            Exception: Network errors, timeouts and decoding errors
                propagate to the caller
        """
        # Local imports: this module does not import these at file level.
        import urllib.error
        import urllib.request

        # Fetch with the user agent the rules are evaluated for.
        request = urllib.request.Request(
            robots_url, headers={'User-Agent': self.user_agent}
        )
        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:
                raw = response.read()
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                parser.disallow_all = True
            elif 400 <= err.code < 500:
                parser.allow_all = True
            logger.warning(f"robots.txt request to {robots_url} returned HTTP {err.code}")
            err.close()
            return False

        parser.parse(raw.decode('utf-8').splitlines())
        return True

    def _get_parser(self, url: str) -> RobotFileParser:
        """
        Get or create a RobotFileParser for the domain.

        Results are cached per robots.txt URL, including the permissive
        parser used when the file could not be fetched.

        Args:
            url: URL to get parser for

        Returns:
            RobotFileParser: Parser for the domain
        """
        robots_url = self._get_robots_url(url)

        cached = self._parsers.get(robots_url)
        if cached is not None:
            return cached

        parser = RobotFileParser()
        parser.set_url(robots_url)

        try:
            if self._read_with_timeout(parser, robots_url):
                logger.info(f"Successfully fetched robots.txt from {robots_url}")
        except Exception as e:
            # If robots.txt cannot be fetched, assume all URLs are allowed
            logger.warning(f"Could not fetch robots.txt from {robots_url}: {e}. Assuming all URLs allowed.")
            parser = RobotFileParser()
            parser.parse([])  # Empty robots.txt allows everything

        self._parsers[robots_url] = parser
        return parser

    def can_fetch(self, url: str) -> bool:
        """
        Check if the given URL can be fetched according to robots.txt.

        Args:
            url: URL to check

        Returns:
            bool: True if allowed (or if the check itself fails), False if
            disallowed
        """
        try:
            parser = self._get_parser(url)
            allowed = parser.can_fetch(self.user_agent, url)

            if not allowed:
                logger.info(f"URL disallowed by robots.txt: {url}")

            return allowed
        except Exception as e:
            logger.error(f"Error checking robots.txt for {url}: {e}")
            # On error, assume allowed
            return True

    def get_crawl_delay(self, url: str) -> Optional[float]:
        """
        Get the Crawl-delay directive from robots.txt if present.

        Args:
            url: URL to get crawl delay for

        Returns:
            Optional[float]: Delay in seconds, or None if not specified or
            if the lookup fails
        """
        try:
            parser = self._get_parser(url)
            delay = parser.crawl_delay(self.user_agent)

            if delay is not None:
                logger.info(f"Crawl-delay from robots.txt: {delay} seconds")

            return delay
        except Exception as e:
            logger.error(f"Error getting crawl delay for {url}: {e}")
            return None
|
||||
171
api/app/core/rag/crawler/url_normalizer.py
Normal file
171
api/app/core/rag/crawler/url_normalizer.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""URL normalization and validation for web crawler."""
|
||||
|
||||
from typing import Optional, List
|
||||
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode, urljoin
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class URLNormalizer:
|
||||
"""Normalize and validate URLs for deduplication and domain checking."""
|
||||
|
||||
# Common tracking parameters to remove
|
||||
TRACKING_PARAMS = {
|
||||
'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
|
||||
'fbclid', 'gclid', 'msclkid', '_ga', 'mc_cid', 'mc_eid'
|
||||
}
|
||||
|
||||
def __init__(self, base_domain: str):
|
||||
"""
|
||||
Initialize URL normalizer with base domain.
|
||||
|
||||
Args:
|
||||
base_domain: The domain to use for same-domain checks
|
||||
"""
|
||||
parsed = urlparse(base_domain)
|
||||
self.base_domain = parsed.netloc.lower() # example.com:8000
|
||||
self.base_scheme = parsed.scheme or 'https' # https
|
||||
|
||||
def normalize(self, url: str) -> Optional[str]:
|
||||
"""
|
||||
Normalize a URL for deduplication.
|
||||
|
||||
Normalization rules:
|
||||
1. Convert domain to lowercase
|
||||
2. Remove fragments (#section)
|
||||
3. Remove default ports (80 for http, 443 for https)
|
||||
4. Remove trailing slashes (except for root)
|
||||
5. Sort query parameters alphabetically
|
||||
6. Remove common tracking parameters
|
||||
|
||||
Args:
|
||||
url: URL to normalize
|
||||
|
||||
Returns:
|
||||
Optional[str]: Normalized URL, or None if invalid
|
||||
"""
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
|
||||
# Validate scheme
|
||||
if parsed.scheme not in ('http', 'https'):
|
||||
return None
|
||||
|
||||
# Normalize domain to lowercase
|
||||
netloc = parsed.netloc.lower()
|
||||
|
||||
# Remove default ports
|
||||
if ':' in netloc:
|
||||
host, port = netloc.rsplit(':', 1)
|
||||
if (parsed.scheme == 'http' and port == '80') or \
|
||||
(parsed.scheme == 'https' and port == '443'):
|
||||
netloc = host
|
||||
|
||||
# Normalize path
|
||||
path = parsed.path
|
||||
# Remove trailing slash except for root
|
||||
if path != '/' and path.endswith('/'):
|
||||
path = path.rstrip('/')
|
||||
# Ensure path starts with /
|
||||
if not path:
|
||||
path = '/'
|
||||
|
||||
# Process query parameters
|
||||
query = ''
|
||||
if parsed.query:
|
||||
# Parse query parameters
|
||||
params = parse_qs(parsed.query, keep_blank_values=True)
|
||||
# Remove tracking parameters
|
||||
filtered_params = {
|
||||
k: v for k, v in params.items()
|
||||
if k not in self.TRACKING_PARAMS
|
||||
}
|
||||
# Sort parameters alphabetically
|
||||
if filtered_params:
|
||||
sorted_params = sorted(filtered_params.items())
|
||||
query = urlencode(sorted_params, doseq=True)
|
||||
|
||||
# Reconstruct URL without fragment
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
netloc,
|
||||
path,
|
||||
parsed.params,
|
||||
query,
|
||||
'' # Remove fragment
|
||||
))
|
||||
|
||||
return normalized
|
||||
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def is_same_domain(self, url: str) -> bool:
|
||||
"""
|
||||
Check if URL belongs to the same domain as base_domain.
|
||||
|
||||
Args:
|
||||
url: URL to check
|
||||
|
||||
Returns:
|
||||
bool: True if same domain, False otherwise
|
||||
"""
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc.lower()
|
||||
|
||||
# Remove port if present
|
||||
if ':' in domain:
|
||||
domain = domain.split(':')[0]
|
||||
|
||||
# Check if domains match
|
||||
return domain == self.base_domain or domain == self.base_domain.split(':')[0]
|
||||
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def extract_links(self, html: str, base_url: str) -> List[str]:
    """
    Extract and normalize all links from HTML.

    Every ``<a href=...>`` is considered in document order. Empty hrefs and
    ``javascript:``, ``mailto:`` and ``tel:`` links are skipped. Absolute
    http(s) links are kept only when ``is_same_domain()`` accepts them; all
    other hrefs are resolved against ``base_url``. Each candidate is passed
    through ``normalize()`` and dropped when that returns a falsy value.

    Args:
        html: HTML content
        base_url: Base URL for resolving relative links

    Returns:
        List[str]: List of normalized absolute URLs (empty when the HTML
        cannot be parsed)
    """
    import logging  # function-scope import keeps this block self-contained

    links: List[str] = []

    try:
        soup = BeautifulSoup(html, 'lxml')
    except Exception as exc:
        # Best-effort: a page that cannot be parsed yields no links, but the
        # failure is reported instead of being silently swallowed.
        logging.getLogger(__name__).warning(
            "Link extraction failed for %s: %s", base_url, exc
        )
        return links

    for anchor in soup.find_all('a', href=True):
        # Handle each link on its own so that one malformed href cannot
        # discard the links that follow it.
        try:
            href = anchor['href'].strip()
            if not href:
                continue

            # URL schemes are case-insensitive; probe a lowered copy.
            probe = href.lower()
            if probe.startswith(('javascript:', 'mailto:', 'tel:')):
                continue

            if probe.startswith(('http://', 'https://')):
                # Absolute link: keep only links on the crawled domain.
                if not self.is_same_domain(href):
                    continue
                candidate = href
            else:
                # Relative link: resolve against the page URL.
                candidate = urljoin(base_url, href)

            normalized_url = self.normalize(candidate)
        except (ValueError, TypeError, AttributeError):
            continue

        if normalized_url:
            links.append(normalized_url)

    return links
|
||||
215
api/app/core/rag/crawler/web_crawler.py
Normal file
215
api/app/core/rag/crawler/web_crawler.py
Normal file
@@ -0,0 +1,215 @@
|
||||
"""Main web crawler orchestrator."""
|
||||
|
||||
from collections import deque
|
||||
from datetime import datetime
|
||||
from typing import Iterator, Optional, List, Set
|
||||
from urllib.parse import urlparse
|
||||
import logging
|
||||
|
||||
from app.core.rag.crawler.url_normalizer import URLNormalizer
|
||||
from app.core.rag.crawler.robots_parser import RobotsParser
|
||||
from app.core.rag.crawler.rate_limiter import RateLimiter
|
||||
from app.core.rag.crawler.http_fetcher import HTTPFetcher
|
||||
from app.core.rag.crawler.content_extractor import ContentExtractor
|
||||
from app.core.rag.crawler.models import CrawledDocument, CrawlSummary
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WebCrawler:
    """Main orchestrator for web crawling.

    URLs are taken from a FIFO queue seeded with the entry URL. Each fetched
    HTML page is passed to the content extractor, and its links are appended
    to the queue when they belong to the entry domain. ``max_pages`` bounds
    the number of successfully processed pages; skipped or failed URLs do not
    count toward it.
    """

    # Content-Type substrings that identify a crawlable HTML response.
    _HTML_CONTENT_TYPES = ('text/html', 'application/xhtml+xml')

    def __init__(
        self,
        entry_url: str,
        max_pages: int = 200,
        delay_seconds: float = 1.0,
        timeout_seconds: int = 10,
        user_agent: str = "KnowledgeBaseCrawler/1.0",
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        content_extractor: Optional[ContentExtractor] = None
    ):
        """
        Initialize the web crawler.

        Args:
            entry_url: Starting URL for the crawl
            max_pages: Maximum number of pages to crawl (default: 200)
            delay_seconds: Delay between requests in seconds (default: 1.0)
            timeout_seconds: HTTP request timeout (default: 10)
            user_agent: User-Agent header string
            include_patterns: List of regex patterns for URLs to include
                (accepted for interface compatibility; not used by this class)
            exclude_patterns: List of regex patterns for URLs to exclude
                (accepted for interface compatibility; not used by this class)
            content_extractor: Custom content extractor (optional)

        Raises:
            ValueError: If entry_url has no scheme or no network location.
        """
        # Validate entry URL before any component is constructed.
        parsed = urlparse(entry_url)
        if not parsed.scheme or not parsed.netloc:
            raise ValueError(f"Invalid entry URL: {entry_url}")

        self.entry_url = entry_url
        self.max_pages = max_pages
        self.user_agent = user_agent

        # Extract domain from entry URL
        self.domain = parsed.netloc

        # Initialize components
        self.url_normalizer = URLNormalizer(entry_url)
        self.robots_parser = RobotsParser(user_agent, timeout_seconds)
        self.rate_limiter = RateLimiter(delay_seconds)
        self.http_fetcher = HTTPFetcher(timeout_seconds, max_retries=3, user_agent=user_agent)
        self.content_extractor = content_extractor or ContentExtractor()

        # State management
        self.url_queue: deque = deque()
        self.visited_urls: Set[str] = set()
        self.pages_processed = 0

        # Statistics
        self.stats = {
            'success': 0,
            'errors': 0,
            'skipped': 0,
            'urls_discovered': 0,
            'error_breakdown': {}
        }
        self.start_time: Optional[datetime] = None
        self.end_time: Optional[datetime] = None

    def crawl(self) -> Iterator[CrawledDocument]:
        """
        Execute the crawl and yield documents as they are processed.

        ``end_time`` is recorded when the loop ends for any reason, including
        the consumer closing the generator before the queue is exhausted.

        Yields:
            CrawledDocument: Structured document with extracted content
        """
        logger.info(f"Starting crawl from {self.entry_url} (max_pages: {self.max_pages})")
        self.start_time = datetime.now()
        # Clear any value left by a previous run so that a summary taken
        # while this run is in progress does not use a stale end time.
        self.end_time = None

        # Add entry URL to queue
        normalized_entry = self.url_normalizer.normalize(self.entry_url)
        if normalized_entry:
            self.url_queue.append(normalized_entry)
            self.stats['urls_discovered'] += 1

        # Check robots.txt and update rate limiter if needed
        crawl_delay = self.robots_parser.get_crawl_delay(self.entry_url)
        if crawl_delay:
            self.rate_limiter.set_delay(crawl_delay)

        # Companion set for O(1) "already queued" checks; scanning the deque
        # for every discovered link is linear in the queue length.
        queued: Set[str] = set(self.url_queue)

        try:
            while self.url_queue and self.pages_processed < self.max_pages:
                url = self.url_queue.popleft()

                # Skip if already visited
                if url in self.visited_urls:
                    continue
                self.visited_urls.add(url)

                document = self._process_url(url, queued)
                if document is not None:
                    yield document
        finally:
            self.end_time = datetime.now()

        logger.info(f"Crawl completed. Processed {self.pages_processed} pages.")

    def _process_url(self, url: str, queued: Set[str]) -> Optional[CrawledDocument]:
        """Fetch one URL and return its document, or None if skipped/failed."""
        # Check robots.txt permission
        if not self.robots_parser.can_fetch(url):
            logger.info(f"Skipping {url} (disallowed by robots.txt)")
            self.stats['skipped'] += 1
            return None

        # Apply rate limiting
        self.rate_limiter.wait()

        logger.info(f"Fetching {url} ({self.pages_processed + 1}/{self.max_pages})")
        fetch_result = self.http_fetcher.fetch(url)

        # Handle fetch errors
        if not fetch_result.success:
            self._record_error(fetch_result.error or "Unknown error")
            return None

        # Only HTML responses are processed.
        content_type = fetch_result.headers.get('Content-Type', '').lower()
        if not any(marker in content_type for marker in self._HTML_CONTENT_TYPES):
            logger.warning(f"Skipping {url} (Content-Type: {content_type})")
            self.stats['skipped'] += 1
            return None

        try:
            extracted = self.content_extractor.extract(fetch_result.content, url)

            # Pages flagged as non-static by the extractor are not indexed.
            if not extracted.is_static:
                logger.warning(f"Skipping {url} (JavaScript-rendered content)")
                self.stats['skipped'] += 1
                return None

            document = CrawledDocument(
                url=url,
                title=extracted.title,
                content=extracted.text,
                content_length=len(extracted.text),
                crawl_timestamp=datetime.now(),
                http_status=fetch_result.status_code,
                metadata={
                    'word_count': extracted.word_count,
                    'final_url': fetch_result.final_url
                }
            )
        except Exception as e:
            logger.error(f"Error processing {url}: {e}")
            self._record_error(f"Processing error: {str(e)}")
            return None

        # Update statistics
        self.pages_processed += 1
        self.stats['success'] += 1

        # A link-discovery failure must not discard a page that has already
        # been extracted and counted as a success.
        try:
            self._enqueue_links(fetch_result.content, url, queued)
        except Exception as e:
            logger.error(f"Error processing {url}: {e}")
            self._record_error(f"Processing error: {str(e)}")

        return document

    def _enqueue_links(self, html: str, page_url: str, queued: Set[str]) -> None:
        """Queue unseen same-domain links found in a page."""
        for link in self.url_normalizer.extract_links(html, page_url):
            if link in self.visited_urls or link in queued:
                continue
            if not self.url_normalizer.is_same_domain(link):
                continue
            self.url_queue.append(link)
            queued.add(link)
            self.stats['urls_discovered'] += 1

    def get_summary(self) -> CrawlSummary:
        """
        Get summary statistics for the crawl.

        May be called before, during or after a crawl. A missing start or end
        time is replaced by the current time for this summary only and is not
        stored, so repeated calls during a running crawl report up-to-date
        durations.

        Returns:
            CrawlSummary: Statistics including success/error/skip counts
        """
        now = datetime.now()
        start_time = self.start_time or now
        end_time = self.end_time or now

        duration = (end_time - start_time).total_seconds()

        return CrawlSummary(
            total_pages_processed=self.stats['success'],
            total_errors=self.stats['errors'],
            total_skipped=self.stats['skipped'],
            total_urls_discovered=self.stats['urls_discovered'],
            start_time=start_time,
            end_time=end_time,
            duration_seconds=duration,
            error_breakdown=self.stats['error_breakdown']
        )

    def _record_error(self, error: str):
        """Record an error in statistics.

        The breakdown key is the text before the first ':' of the message,
        or the whole message when it contains no ':'.
        """
        self.stats['errors'] += 1
        error_type = error.split(':', 1)[0]
        breakdown = self.stats['error_breakdown']
        breakdown[error_type] = breakdown.get(error_type, 0) + 1
|
||||
Reference in New Issue
Block a user