[ADD] Third-party synchronization

1. Third-party website data access - Website synchronization
Build a knowledge base by crawling web pages in batches with a web crawler.
Website synchronization uses crawler technology to automatically capture all pages under the same domain name, starting from a single entry URL. It currently supports up to 200 subpages. For compliance and security reasons, only static sites can be crawled; the feature is mainly intended for quickly building knowledge bases from documentation sites.
2. Feishu Knowledge Base
By configuring Feishu document permissions, a knowledge base can be built from Feishu documents; the documents are not stored a second time.
3. Yuque Knowledge Base
By configuring Yuque document permissions, a knowledge base can be built from Yuque documents; the documents are not stored a second time.
This commit is contained in:
lixiangcheng1
2026-02-06 12:18:40 +08:00
parent c1941809e9
commit db46c186aa
30 changed files with 3422 additions and 1 deletions

View File

View File

@@ -0,0 +1,89 @@
"""Command-line interface for web crawler."""
import argparse
import logging
import sys
from app.core.rag.crawler.web_crawler import WebCrawler
def setup_logging(verbose: bool = False):
    """Configure root logging so that records are written to stdout.

    Args:
        verbose: When True the root level is DEBUG, otherwise INFO.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    log_level = logging.INFO
    if verbose:
        log_level = logging.DEBUG
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        handlers=[stdout_handler],
    )
def main(entry_url: str,
         max_pages: int = 200,
         delay_seconds: float = 1.0,
         timeout_seconds: int = 10,
         user_agent: str = "KnowledgeBaseCrawler/1.0"):
    """Crawl a site starting at ``entry_url`` and print the results.

    Every crawled page is printed as it arrives and collected as a plain
    dict; a summary of the whole crawl is printed at the end.

    Args:
        entry_url: URL the crawl starts from.
        max_pages: Maximum number of pages to crawl.
        delay_seconds: Delay between requests in seconds.
        timeout_seconds: HTTP request timeout in seconds.
        user_agent: User-Agent header value sent by the crawler.
    """
    banner = '=' * 80
    crawler = WebCrawler(
        entry_url=entry_url,
        max_pages=max_pages,
        delay_seconds=delay_seconds,
        timeout_seconds=timeout_seconds,
        user_agent=user_agent,
    )

    # Crawl and collect one serializable record per page.
    collected = []
    try:
        for page in crawler.crawl():
            print(f"\n{banner}")
            print(f"URL: {page.url}")
            print(f"Title: {page.title}")
            print(f"Content Length: {page.content_length} characters")
            print(f"Word Count: {page.metadata.get('word_count', 0)} words")
            print(f"{banner}\n")
            collected.append(dict(
                url=page.url,
                title=page.title,
                content=page.content,
                content_length=page.content_length,
                crawl_timestamp=page.crawl_timestamp.isoformat(),
                http_status=page.http_status,
                metadata=page.metadata,
            ))
    except KeyboardInterrupt:
        # An interrupted crawl still reports what was gathered so far.
        print("\n\nCrawl interrupted by user.")
    except Exception as exc:
        print(f"\n\nError during crawl: {exc}")
        sys.exit(1)

    # Final report.
    report = crawler.get_summary()
    print(f"\n{banner}")
    print("CRAWL SUMMARY")
    print(banner)
    print(f"Total Pages Processed: {report.total_pages_processed}")
    print(f"Total Errors: {report.total_errors}")
    print(f"Total Skipped: {report.total_skipped}")
    print(f"Total URLs Discovered: {report.total_urls_discovered}")
    print(f"Duration: {report.duration_seconds:.2f} seconds")
    print(f"documents: {collected}")
    breakdown = report.error_breakdown
    if breakdown:
        print("\nError Breakdown:")
        for kind, occurrences in breakdown.items():
            print(f" {kind}: {occurrences}")
if __name__ == '__main__':
    # Command-line entry point. Every option defaults to the value that used
    # to be hard-coded here, so running the module without arguments performs
    # the same crawl as before (with logging now configured).
    arg_parser = argparse.ArgumentParser(
        description="Crawl a static website and print the extracted documents."
    )
    arg_parser.add_argument(
        'entry_url', nargs='?', default="https://www.xxx.com",
        help="URL to start crawling from",
    )
    arg_parser.add_argument(
        '--max-pages', type=int, default=20,
        help="maximum number of pages to crawl",
    )
    arg_parser.add_argument(
        '--delay', dest='delay_seconds', type=float, default=1.0,
        help="delay between requests in seconds",
    )
    arg_parser.add_argument(
        '--timeout', dest='timeout_seconds', type=int, default=10,
        help="HTTP request timeout in seconds",
    )
    arg_parser.add_argument(
        '--user-agent', default="KnowledgeBaseCrawler/1.0",
        help="User-Agent header value",
    )
    arg_parser.add_argument(
        '-v', '--verbose', action='store_true',
        help="enable DEBUG logging",
    )
    cli_args = arg_parser.parse_args()

    setup_logging(cli_args.verbose)
    main(
        cli_args.entry_url,
        cli_args.max_pages,
        cli_args.delay_seconds,
        cli_args.timeout_seconds,
        cli_args.user_agent,
    )

View File

@@ -0,0 +1,233 @@
"""Content extractor for web crawler."""
from bs4 import BeautifulSoup
import re
import logging
from app.core.rag.crawler.models import ExtractedContent
logger = logging.getLogger(__name__)


class ContentExtractor:
    """Extract clean, readable text from HTML pages.

    The extractor picks the most likely main-content container of a page,
    emits one line of text per innermost block element and normalizes
    whitespace.
    """

    # Tags to remove completely
    REMOVE_TAGS = ['script', 'style', 'nav', 'header', 'footer', 'aside']
    # Tags that typically contain main content
    MAIN_CONTENT_TAGS = ['article', 'main']
    # Block-level tags; each one starts a new line in the extracted text
    CONTENT_TAGS = ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'td', 'th', 'section']

    def is_static_content(self, html: str) -> bool:
        """
        Determine if the HTML represents static content.

        Detects JavaScript-rendered content by checking for a minimal body
        combined with heavy script tag presence.

        Args:
            html: Raw HTML string

        Returns:
            bool: True if static, False if JavaScript-rendered
        """
        try:
            soup = BeautifulSoup(html, 'lxml')
            # Count scripts before they are stripped from the body below.
            script_count = len(soup.find_all('script'))
            body = soup.find('body')
            if body is None:
                return False
            # Script and style text is not readable content.
            for tag in body.find_all(['script', 'style']):
                tag.decompose()
            text_length = len(body.get_text(strip=True))
            # If there's very little text but many scripts, likely JS-rendered
            if script_count > 5 and text_length < 200:
                logger.warning("Detected JavaScript-rendered content (many scripts, little text)")
                return False
            # If there's no meaningful text, likely JS-rendered
            if text_length < 50:
                logger.warning("Detected JavaScript-rendered content (minimal text)")
                return False
            return True
        except Exception as e:
            logger.error(f"Error checking if content is static: {e}")
            return True  # Assume static on error

    def extract(self, html: str, url: str) -> ExtractedContent:
        """
        Extract clean text content from HTML.

        Never raises: on failure an empty ExtractedContent with
        ``is_static=False`` and the error message in its metadata is returned.

        Args:
            html: Raw HTML string
            url: Source URL (used for logging and stored in the metadata)

        Returns:
            ExtractedContent: Contains title, text, metadata
        """
        try:
            soup = BeautifulSoup(html, 'lxml')
            is_static = self.is_static_content(html)
            # The title must be read before <header> and friends are removed.
            title = self._extract_title(soup)
            for tag_name in self.REMOVE_TAGS:
                for tag in soup.find_all(tag_name):
                    tag.decompose()
            text = self._normalize_whitespace(self._extract_main_content(soup))
            word_count = len(text.split())
            logger.info(f"Extracted {word_count} words from {url}")
            return ExtractedContent(
                title=title,
                text=text,
                is_static=is_static,
                word_count=word_count,
                metadata={'url': url}
            )
        except Exception as e:
            logger.error(f"Error extracting content from {url}: {e}")
            return ExtractedContent(
                title=url,
                text="",
                is_static=False,
                word_count=0,
                metadata={'url': url, 'error': str(e)}
            )

    def _extract_title(self, soup: BeautifulSoup) -> str:
        """
        Extract title from HTML.

        Tries the <title> tag first, then the first <h1>.

        Args:
            soup: BeautifulSoup object

        Returns:
            str: Page title, or an empty string if none was found
        """
        title_tag = soup.find('title')
        if title_tag and title_tag.string:
            return title_tag.string.strip()
        h1_tag = soup.find('h1')
        if h1_tag:
            return h1_tag.get_text(strip=True)
        return ""

    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """
        Extract main content from HTML.

        Prioritizes semantic HTML5 elements like <article> and <main>, then
        ``div[role=main]``, then common class/id names, and finally <body>.
        The text of the chosen container is returned with one line per
        innermost block element, in document order. Each piece of text is
        emitted exactly once, even when block elements are nested.

        Args:
            soup: BeautifulSoup object

        Returns:
            str: Extracted text content ("" if the document has no body)
        """
        main_content = None
        # Priority 1: <article> or <main> tags
        for tag_name in self.MAIN_CONTENT_TAGS:
            main_content = soup.find(tag_name)
            if main_content:
                logger.debug(f"Found main content in <{tag_name}> tag")
                break
        # Priority 2: div with role="main"
        if not main_content:
            main_content = soup.find('div', role='main')
            if main_content:
                logger.debug("Found main content in div[role='main']")
        # Priority 3: Common class/id patterns
        if not main_content:
            for pattern in ['content', 'main', 'article', 'post']:
                matcher = re.compile(pattern, re.I)
                main_content = soup.find(['div', 'section'], class_=matcher)
                if main_content:
                    logger.debug(f"Found main content with class pattern '{pattern}'")
                    break
                main_content = soup.find(['div', 'section'], id=matcher)
                if main_content:
                    logger.debug(f"Found main content with id pattern '{pattern}'")
                    break
        # Fallback: use body
        if not main_content:
            main_content = soup.find('body')
            logger.debug("Using <body> as main content (no specific content area found)")
        if main_content is None:
            return ""
        text_parts = []
        self._collect_block_texts(main_content, text_parts)
        return '\n'.join(text_parts)

    def _collect_block_texts(self, node, parts: list) -> None:
        """
        Append the text under ``node`` to ``parts``, one entry per block.

        Children that are block elements (CONTENT_TAGS), or that wrap one,
        are descended into and start a new entry. Everything else (text
        nodes and inline elements) is accumulated as part of the current
        entry. Whitespace inside an entry is collapsed to single spaces.

        Args:
            node: bs4 Tag whose children are walked
            parts: Output list, extended in document order
        """
        # Imported here because only the BeautifulSoup class is imported at
        # module level.
        from bs4.element import CData, NavigableString, Tag

        pending = []  # raw inline text that belongs directly to ``node``

        def flush() -> None:
            text = ' '.join(''.join(pending).split())
            pending.clear()
            if text:
                parts.append(text)

        for child in node.children:
            if isinstance(child, Tag):
                if child.name == 'br':
                    # A line break separates words even without whitespace.
                    pending.append(' ')
                elif child.name in self.CONTENT_TAGS or child.find(self.CONTENT_TAGS) is not None:
                    # Block boundary: close the current entry, then recurse.
                    flush()
                    self._collect_block_texts(child, parts)
                else:
                    pending.append(child.get_text())
            elif type(child) in (NavigableString, CData):
                # Exact type check skips comments, doctypes and similar nodes.
                pending.append(str(child))
        flush()

    def _normalize_whitespace(self, text: str) -> str:
        """
        Normalize whitespace in text.

        - Collapse multiple spaces to single space
        - Reduce excessive newlines to maximum 2
        - Strip leading/trailing whitespace

        Args:
            text: Text to normalize

        Returns:
            str: Normalized text
        """
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()

View File

@@ -0,0 +1,302 @@
"""HTTP fetcher for web crawler."""
import requests
import time
import logging
import re
from typing import Optional, Dict
from app.core.rag.crawler.models import FetchResult
logger = logging.getLogger(__name__)


class HTTPFetcher:
    """Handle HTTP requests with retries, error handling, and response validation."""

    # Encodings tried in order when neither the document nor the server
    # declares a usable charset and the body is not valid UTF-8. Only codecs
    # that can fail are listed; latin-1 (which accepts every byte) is the
    # final fallback in _get_decoded_content.
    _FALLBACK_ENCODINGS = (
        'gbk',           # Chinese (Simplified)
        'gb2312',        # Chinese (Simplified, older)
        'gb18030',       # Chinese (Simplified, extended)
        'big5',          # Chinese (Traditional)
        'shift_jis',     # Japanese
        'euc-jp',        # Japanese
        'euc-kr',        # Korean
        'windows-1252',  # Western European (superset of printable latin-1)
    )
    _META_CHARSET_RE = re.compile(r'<meta[^>]+charset=["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE)
    _META_CONTENT_TYPE_RE = re.compile(
        r'<meta[^>]+http-equiv=["\']?content-type["\']?[^>]+content=["\']([^"\']+)',
        re.IGNORECASE,
    )
    _CHARSET_VALUE_RE = re.compile(r'charset=([a-zA-Z0-9_-]+)', re.IGNORECASE)

    def __init__(
        self,
        timeout: int = 10,
        max_retries: int = 3,
        user_agent: str = "KnowledgeBaseCrawler/1.0"
    ):
        """
        Initialize HTTP fetcher.

        Args:
            timeout: Request timeout in seconds
            max_retries: Maximum number of attempts per URL
            user_agent: User-Agent header value
        """
        self.timeout = timeout
        self.max_retries = max_retries
        self.user_agent = user_agent
        # Create session for connection pooling
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': user_agent
        })

    def fetch(self, url: str) -> FetchResult:
        """
        Fetch a URL with retry logic and error handling.

        Timeouts, connection errors, 429 and 5xx responses are retried with
        exponential backoff. SSL errors, other 4xx responses and unexpected
        status codes fail immediately. Request errors are never raised to the
        caller; they are reported through the returned FetchResult.

        Args:
            url: URL to fetch

        Returns:
            FetchResult: Contains status_code, content, headers, error info
        """
        # Always make at least one attempt, even if max_retries is 0 or less.
        attempts = max(1, self.max_retries)
        last_error = None
        for attempt in range(attempts):
            is_last = attempt >= attempts - 1
            if attempt > 0:
                backoff_delay = 2 ** (attempt - 1)  # 1s, 2s, 4s
                logger.info(f"Retry attempt {attempt + 1}/{attempts} for {url} after {backoff_delay}s")
                time.sleep(backoff_delay)

            try:
                response = self.session.get(
                    url,
                    timeout=self.timeout,
                    allow_redirects=True
                )
            except requests.exceptions.Timeout:
                last_error = "Request timeout"
                logger.warning(f"Timeout fetching {url} (attempt {attempt + 1}/{attempts})")
                continue
            except requests.exceptions.SSLError as e:
                # Certificate problems do not go away on retry.
                # SSLError must be caught before its base class ConnectionError.
                logger.error(f"SSL/TLS error for {url}: {e}")
                return self._failure(url, f"SSL/TLS error: {str(e)}")
            except requests.exceptions.ConnectionError as e:
                last_error = f"Connection error: {str(e)}"
                logger.warning(f"Connection error for {url} (attempt {attempt + 1}/{attempts}): {e}")
                continue
            except requests.exceptions.RequestException as e:
                last_error = f"Request error: {str(e)}"
                logger.error(f"Request error for {url}: {e}")
                continue

            status = response.status_code
            if status == 429:
                # Too Many Requests - the backoff at the top of the loop applies
                logger.warning(f"429 Too Many Requests for {url}, backing off")
                if not is_last:
                    continue
            if status == 503:
                logger.warning(f"503 Service Unavailable for {url}")
                if not is_last:
                    time.sleep(5)  # Longer pause for 503
                    continue

            if 200 <= status < 300:
                logger.info(f"Successfully fetched {url} (status: {status})")
                return FetchResult(
                    url=url,
                    final_url=response.url,
                    status_code=status,
                    content=self._get_decoded_content(response),
                    headers=dict(response.headers),
                    error=None,
                    success=True
                )
            if status == 404:
                logger.info(f"404 Not Found: {url}")
                return self._failure(url, "Not Found", response)
            if 400 <= status < 500:
                # Client errors (including a final 429) are not retried.
                logger.warning(f"Client error {status} for {url}")
                return self._failure(url, f"Client error: {status}", response)
            if 500 <= status < 600:
                logger.error(f"Server error {status} for {url}")
                last_error = f"Server error: {status}"
                if not is_last:
                    continue
                return self._failure(url, last_error, response)

            # 1xx/3xx responses that requests did not resolve (e.g. 304).
            # Retrying cannot help, so report them explicitly.
            logger.warning(f"Unexpected status {status} for {url}")
            return self._failure(url, f"Unexpected status: {status}", response)

        # All retries exhausted
        logger.error(f"Failed to fetch {url} after {attempts} attempts: {last_error}")
        return self._failure(url, last_error or "Unknown error")

    def _failure(self, url: str, error: str, response=None) -> FetchResult:
        """
        Build an unsuccessful FetchResult.

        Args:
            url: URL that was requested
            error: Human-readable error description
            response: The requests.Response if one was received, else None

        Returns:
            FetchResult: success=False; status_code is 0 without a response
        """
        if response is None:
            return FetchResult(
                url=url,
                final_url=url,
                status_code=0,
                content=None,
                headers={},
                error=error,
                success=False
            )
        return FetchResult(
            url=url,
            final_url=response.url,
            status_code=response.status_code,
            content=None,
            headers=dict(response.headers),
            error=error,
            success=False
        )

    def _get_decoded_content(self, response) -> str:
        """
        Get correctly decoded content from response.

        Handles encoding detection and fallback strategies:
        1. Try encoding from HTML meta tags
        2. Try response.encoding (from Content-Type header), unless it is
           the ISO-8859-1 default requests assumes when no charset is sent
        3. Try UTF-8
        4. Try common legacy encodings (GBK, Big5, Shift_JIS, ...)
        5. Fall back to latin-1, which accepts any byte sequence

        Args:
            response: requests.Response object

        Returns:
            str: Decoded content
        """
        raw = response.content

        meta_encoding = self._detect_encoding_from_meta(raw)
        if meta_encoding:
            try:
                content = raw.decode(meta_encoding)
                logger.info(f"Successfully decoded with meta tag encoding: {meta_encoding}")
                return content
            except (UnicodeDecodeError, LookupError) as e:
                logger.warning(f"Failed to decode with meta encoding {meta_encoding}: {e}")

        # requests defaults to ISO-8859-1 if no charset is in Content-Type,
        # so that value is skipped here and UTF-8 is tried first.
        if response.encoding and response.encoding.lower() != 'iso-8859-1':
            try:
                return response.text
            except (UnicodeDecodeError, LookupError) as e:
                logger.warning(f"Failed to decode with detected encoding {response.encoding}: {e}")

        try:
            return raw.decode('utf-8')
        except UnicodeDecodeError:
            logger.debug("UTF-8 decoding failed, trying other encodings")

        for encoding in self._FALLBACK_ENCODINGS:
            try:
                content = raw.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                continue
            logger.info(f"Successfully decoded with {encoding}")
            return content

        # Last resort: use latin-1 with error replacement
        logger.warning("All encoding attempts failed, using latin-1 with error replacement")
        return raw.decode('latin-1', errors='replace')

    def _detect_encoding_from_meta(self, content: bytes) -> Optional[str]:
        """
        Detect encoding from HTML meta tags.

        Looks for:
        - <meta charset="...">
        - <meta http-equiv="Content-Type" content="...; charset=...">

        Args:
            content: Raw response content (bytes)

        Returns:
            Optional[str]: Detected encoding (lowercased) or None
        """
        if not content:
            return None
        try:
            # Only check first 2KB for performance. Non-ASCII bytes are
            # dropped: the meta tag itself is ASCII in every supported encoding.
            head_str = content[:2048].decode('ascii', errors='ignore')
        except (AttributeError, TypeError) as e:
            logger.debug(f"Error detecting encoding from meta tags: {e}")
            return None

        charset_match = self._META_CHARSET_RE.search(head_str)
        if charset_match:
            encoding = charset_match.group(1).lower()
            logger.debug(f"Found charset in meta tag: {encoding}")
            return encoding

        content_type_match = self._META_CONTENT_TYPE_RE.search(head_str)
        if content_type_match:
            charset_match = self._CHARSET_VALUE_RE.search(content_type_match.group(1))
            if charset_match:
                encoding = charset_match.group(1).lower()
                logger.debug(f"Found charset in Content-Type meta: {encoding}")
                return encoding
        return None

View File

@@ -0,0 +1,52 @@
"""Data models for web crawler."""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Dict, Any, Optional
@dataclass
class CrawledDocument:
    """Represents a successfully processed web page with extracted content.

    Plain data holder; no validation is performed on the fields.
    """
    # URL of the page
    url: str
    # Page title
    title: str
    # Extracted text content of the page
    content: str
    # Length of the content, in characters
    content_length: int
    # When the page was crawled
    crawl_timestamp: datetime
    # HTTP status code of the response
    http_status: int
    # Free-form extra information about the page; a fresh dict per instance
    metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class FetchResult:
    """Represents the result of an HTTP fetch operation.

    All fields are required; ``content`` and ``error`` may be None.
    """
    # URL that was requested
    url: str
    # URL the response came from (presumably after redirects; confirm with the fetcher)
    final_url: str
    # HTTP status code of the response
    status_code: int
    # Decoded response body, or None when no body is available
    content: Optional[str]
    # Response headers as a plain dict
    headers: Dict[str, str]
    # Error description, or None when there was no error
    error: Optional[str]
    # True when the fetch succeeded
    success: bool
@dataclass
class ExtractedContent:
    """Represents content extracted from HTML."""
    # Page title
    title: str
    # Extracted text
    text: str
    # Whether the page was judged to be static (as opposed to JavaScript-rendered)
    is_static: bool
    # Number of words in the extracted text
    word_count: int
    # Free-form extra information; a fresh dict per instance
    metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class CrawlSummary:
    """Represents statistics from a completed crawl."""
    # Number of pages processed
    total_pages_processed: int
    # Number of errors recorded
    total_errors: int
    # Number of URLs skipped
    total_skipped: int
    # Number of URLs discovered
    total_urls_discovered: int
    # When the crawl started
    start_time: datetime
    # When the crawl ended
    end_time: datetime
    # Crawl duration in seconds
    duration_seconds: float
    # Error counts keyed by error type; a fresh dict per instance
    error_breakdown: Dict[str, int] = field(default_factory=dict)

View File

@@ -0,0 +1,57 @@
"""Rate limiter for web crawler."""
import time
import logging
logger = logging.getLogger(__name__)


class RateLimiter:
    """Enforce delays between requests to be polite to servers.

    Intervals are measured with the monotonic clock, so adjustments of the
    system time (NTP, DST, manual changes) can neither stall the crawler nor
    let it skip the delay.
    """

    def __init__(self, delay_seconds: float = 1.0):
        """
        Initialize rate limiter.

        Args:
            delay_seconds: Minimum delay between requests
        """
        self.delay_seconds = delay_seconds
        # time.monotonic() reading of the last request. The reference point
        # of that clock is undefined, so "no request yet" is represented by
        # -inf, which makes the first wait() return immediately.
        self.last_request_time = float('-inf')
        self.max_delay = 60.0  # Cap maximum delay at 60 seconds

    def wait(self):
        """
        Block until enough time has passed since last request.

        Respects the configured delay. The first call never sleeps.
        """
        elapsed = time.monotonic() - self.last_request_time
        if elapsed < self.delay_seconds:
            sleep_time = self.delay_seconds - elapsed
            logger.debug(f"Rate limiting: sleeping for {sleep_time:.2f} seconds")
            time.sleep(sleep_time)
        self.last_request_time = time.monotonic()

    def set_delay(self, delay_seconds: float):
        """
        Update the delay (useful for respecting Crawl-delay from robots.txt).

        Args:
            delay_seconds: New delay in seconds; capped at ``max_delay``
        """
        self.delay_seconds = min(delay_seconds, self.max_delay)
        logger.info(f"Rate limiter delay updated to {self.delay_seconds} seconds")

    def backoff(self, multiplier: float = 2.0):
        """
        Increase delay exponentially for backoff scenarios (429, 503 responses).

        Args:
            multiplier: Factor to multiply current delay by; the result is
                capped at ``max_delay``
        """
        old_delay = self.delay_seconds
        self.delay_seconds = min(old_delay * multiplier, self.max_delay)
        logger.warning(f"Rate limiter backing off: {old_delay:.2f}s -> {self.delay_seconds:.2f}s")

View File

@@ -0,0 +1,118 @@
"""Robots.txt parser for web crawler."""
from urllib.robotparser import RobotFileParser
from urllib.parse import urlparse, urljoin
from typing import Optional
import logging
logger = logging.getLogger(__name__)


class RobotsParser:
    """Parse and check robots.txt compliance for URLs."""

    def __init__(self, user_agent: str, timeout: int = 10):
        """
        Initialize robots.txt parser.

        Args:
            user_agent: User agent string to check permissions for; also sent
                as the User-Agent header when fetching robots.txt
            timeout: Timeout in seconds for fetching robots.txt
        """
        self.user_agent = user_agent
        self.timeout = timeout
        self._parsers = {}  # Cache parsers by robots.txt URL

    def _get_robots_url(self, url: str) -> str:
        """
        Get the robots.txt URL for a given URL.

        Args:
            url: URL to get robots.txt for

        Returns:
            str: robots.txt URL
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}/robots.txt"

    def _get_parser(self, url: str) -> RobotFileParser:
        """
        Get or create a RobotFileParser for the domain.

        Args:
            url: URL to get parser for

        Returns:
            RobotFileParser: Parser for the domain
        """
        robots_url = self._get_robots_url(url)
        cached = self._parsers.get(robots_url)
        if cached is not None:
            return cached
        parser = self._load_parser(robots_url)
        self._parsers[robots_url] = parser
        return parser

    def _load_parser(self, robots_url: str) -> RobotFileParser:
        """
        Download and parse one robots.txt file, honoring ``self.timeout``.

        HTTP error responses are interpreted the same way as
        ``RobotFileParser.read()``: 401/403 disallow everything, other 4xx
        allow everything, and any other HTTP error leaves the site
        disallowed. Failures to reach the server at all (DNS, refused
        connection, timeout, undecodable body) allow everything.

        Args:
            robots_url: Absolute URL of the robots.txt file

        Returns:
            RobotFileParser: Parser ready for can_fetch()/crawl_delay()
        """
        import urllib.error
        import urllib.request

        parser = RobotFileParser()
        parser.set_url(robots_url)
        request = urllib.request.Request(robots_url, headers={'User-Agent': self.user_agent})
        try:
            with urllib.request.urlopen(request, timeout=self.timeout) as response:
                raw = response.read()
            parser.parse(raw.decode('utf-8').splitlines())
            logger.info(f"Successfully fetched robots.txt from {robots_url}")
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                parser.disallow_all = True
                outcome = "disallowing all URLs"
            elif 400 <= err.code < 500:
                parser.allow_all = True
                outcome = "allowing all URLs"
            else:
                # Server-side failure: stay conservative and crawl nothing.
                parser.disallow_all = True
                outcome = "disallowing all URLs"
            logger.warning(f"robots.txt at {robots_url} returned HTTP {err.code}; {outcome}.")
        except Exception as e:
            # If robots.txt cannot be fetched, assume all URLs are allowed
            logger.warning(f"Could not fetch robots.txt from {robots_url}: {e}. Assuming all URLs allowed.")
            parser = RobotFileParser()
            parser.parse([])  # Empty robots.txt allows everything
        return parser

    def can_fetch(self, url: str) -> bool:
        """
        Check if the given URL can be fetched according to robots.txt.

        Args:
            url: URL to check

        Returns:
            bool: True if allowed, False if disallowed
        """
        try:
            allowed = self._get_parser(url).can_fetch(self.user_agent, url)
        except Exception as e:
            logger.error(f"Error checking robots.txt for {url}: {e}")
            # On error, assume allowed
            return True
        if not allowed:
            logger.info(f"URL disallowed by robots.txt: {url}")
        return allowed

    def get_crawl_delay(self, url: str) -> Optional[float]:
        """
        Get the Crawl-delay directive from robots.txt if present.

        Args:
            url: URL to get crawl delay for

        Returns:
            Optional[float]: Delay in seconds, or None if not specified
        """
        try:
            delay = self._get_parser(url).crawl_delay(self.user_agent)
        except Exception as e:
            logger.error(f"Error getting crawl delay for {url}: {e}")
            return None
        if delay is not None:
            logger.info(f"Crawl-delay from robots.txt: {delay} seconds")
        return delay

View File

@@ -0,0 +1,171 @@
"""URL normalization and validation for web crawler."""
from typing import Optional, List
from urllib.parse import urlparse, urlunparse, parse_qs, urlencode, urljoin
from bs4 import BeautifulSoup
class URLNormalizer:
    """Normalize and validate URLs for deduplication and domain checking."""

    # Common tracking parameters to remove
    TRACKING_PARAMS = {
        'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
        'fbclid', 'gclid', 'msclkid', '_ga', 'mc_cid', 'mc_eid'
    }

    def __init__(self, base_domain: str):
        """
        Initialize URL normalizer with base domain.

        Args:
            base_domain: Absolute URL whose host is used for same-domain checks
        """
        parsed = urlparse(base_domain)
        self.base_domain = parsed.netloc.lower()  # example.com:8000
        self.base_scheme = parsed.scheme or 'https'  # https

    def normalize(self, url: str) -> Optional[str]:
        """
        Normalize a URL for deduplication.

        Normalization rules:
        1. Convert domain to lowercase
        2. Remove fragments (#section)
        3. Remove default ports (80 for http, 443 for https)
        4. Remove trailing slashes (except for root)
        5. Sort query parameters alphabetically
        6. Remove common tracking parameters

        Args:
            url: URL to normalize

        Returns:
            Optional[str]: Normalized URL, or None if the URL is not a valid
            http(s) URL with a host
        """
        try:
            parsed = urlparse(url)
            if parsed.scheme not in ('http', 'https'):
                return None
            netloc = parsed.netloc.lower()
            if not netloc:
                # "http:///path" has no host and can never be fetched.
                return None
            # Remove default ports
            if ':' in netloc:
                host, port = netloc.rsplit(':', 1)
                default_port = '80' if parsed.scheme == 'http' else '443'
                if port == default_port:
                    netloc = host
            # Remove trailing slashes; an empty result is the root path.
            path = parsed.path
            if path != '/' and path.endswith('/'):
                path = path.rstrip('/')
            if not path:
                path = '/'
            # Drop tracking parameters and sort the remainder.
            query = ''
            if parsed.query:
                params = parse_qs(parsed.query, keep_blank_values=True)
                kept = sorted(
                    (key, values) for key, values in params.items()
                    if key not in self.TRACKING_PARAMS
                )
                if kept:
                    query = urlencode(kept, doseq=True)
            # Reconstruct URL without fragment
            return urlunparse((parsed.scheme, netloc, path, parsed.params, query, ''))
        except (AttributeError, TypeError, ValueError):
            return None

    def is_same_domain(self, url: str) -> bool:
        """
        Check if URL belongs to the same domain as base_domain.

        Ports are ignored on both sides; only the host names are compared.

        Args:
            url: URL to check

        Returns:
            bool: True if same domain, False otherwise
        """
        try:
            host = urlparse(url).netloc.lower().split(':')[0]
        except (AttributeError, TypeError, ValueError):
            return False
        return host == self.base_domain or host == self.base_domain.split(':')[0]

    def extract_links(self, html: str, base_url: str) -> List[str]:
        """
        Extract and normalize all same-domain links from HTML.

        Every href is resolved against ``base_url`` first, so relative,
        absolute and protocol-relative (``//host/path``) links are all
        subject to the same-domain check. Each normalized URL is returned at
        most once, in order of first appearance.

        Args:
            html: HTML content
            base_url: Base URL for resolving relative links

        Returns:
            List[str]: List of normalized absolute URLs
        """
        try:
            soup = BeautifulSoup(html, 'lxml')
            anchors = soup.find_all('a', href=True)
        except Exception as exc:
            # Link extraction is best effort: an unparsable page simply
            # contributes no links, but the failure is no longer silent.
            import logging
            logging.getLogger(__name__).warning(
                "Could not extract links from %s: %s", base_url, exc
            )
            return []

        links = []
        seen = set()
        for anchor in anchors:
            href = str(anchor['href']).strip()
            # Skip empty hrefs and non-navigational schemes
            if not href or href.lower().startswith(('javascript:', 'mailto:', 'tel:')):
                continue
            try:
                absolute_url = urljoin(base_url, href)
            except ValueError:
                continue  # malformed href (e.g. broken IPv6 literal)
            if not self.is_same_domain(absolute_url):
                continue
            normalized_url = self.normalize(absolute_url)
            if normalized_url and normalized_url not in seen:
                seen.add(normalized_url)
                links.append(normalized_url)
        return links

View File

@@ -0,0 +1,215 @@
"""Main web crawler orchestrator."""
from collections import deque
from datetime import datetime
from typing import Iterator, Optional, List, Set
from urllib.parse import urlparse
import logging
from app.core.rag.crawler.url_normalizer import URLNormalizer
from app.core.rag.crawler.robots_parser import RobotsParser
from app.core.rag.crawler.rate_limiter import RateLimiter
from app.core.rag.crawler.http_fetcher import HTTPFetcher
from app.core.rag.crawler.content_extractor import ContentExtractor
from app.core.rag.crawler.models import CrawledDocument, CrawlSummary
logger = logging.getLogger(__name__)


class WebCrawler:
    """Main orchestrator for web crawling."""

    def __init__(
        self,
        entry_url: str,
        max_pages: int = 200,
        delay_seconds: float = 1.0,
        timeout_seconds: int = 10,
        user_agent: str = "KnowledgeBaseCrawler/1.0",
        include_patterns: Optional[List[str]] = None,
        exclude_patterns: Optional[List[str]] = None,
        content_extractor: Optional[ContentExtractor] = None
    ):
        """
        Initialize the web crawler.

        Args:
            entry_url: Starting URL for the crawl
            max_pages: Maximum number of pages to crawl (default: 200)
            delay_seconds: Delay between requests in seconds (default: 1.0)
            timeout_seconds: HTTP request timeout (default: 10)
            user_agent: User-Agent header string
            include_patterns: List of regex patterns; when given, a discovered
                URL is only queued if at least one pattern matches it
            exclude_patterns: List of regex patterns; a discovered URL is not
                queued if any pattern matches it
            content_extractor: Custom content extractor (optional)

        Raises:
            ValueError: If ``entry_url`` has no scheme or host, or if a
                pattern is not a valid regular expression
        """
        # Validate entry URL
        parsed = urlparse(entry_url)
        if not parsed.scheme or not parsed.netloc:
            raise ValueError(f"Invalid entry URL: {entry_url}")
        self.entry_url = entry_url
        self.max_pages = max_pages
        self.user_agent = user_agent
        # Extract domain from entry URL
        self.domain = parsed.netloc
        # URL filters; the entry URL itself is never filtered.
        self._include_patterns = self._compile_patterns(include_patterns)
        self._exclude_patterns = self._compile_patterns(exclude_patterns)
        # Initialize components
        self.url_normalizer = URLNormalizer(entry_url)
        self.robots_parser = RobotsParser(user_agent, timeout_seconds)
        self.rate_limiter = RateLimiter(delay_seconds)
        self.http_fetcher = HTTPFetcher(timeout_seconds, max_retries=3, user_agent=user_agent)
        self.content_extractor = content_extractor or ContentExtractor()
        # State management
        self.url_queue: deque = deque()
        self.visited_urls: Set[str] = set()
        # Every URL that was ever queued; gives O(1) duplicate detection
        # instead of scanning the deque for each discovered link.
        self._queued_urls: Set[str] = set()
        self.pages_processed = 0
        # Statistics
        self.stats = {
            'success': 0,
            'errors': 0,
            'skipped': 0,
            'urls_discovered': 0,
            'error_breakdown': {}
        }
        self.start_time: Optional[datetime] = None
        self.end_time: Optional[datetime] = None

    @staticmethod
    def _compile_patterns(patterns) -> list:
        """Compile regex strings; raise ValueError naming the first invalid one."""
        import re

        compiled = []
        for pattern in patterns or []:
            try:
                compiled.append(re.compile(pattern))
            except re.error as err:
                raise ValueError(f"Invalid URL pattern {pattern!r}: {err}") from err
        return compiled

    @staticmethod
    def _get_header(headers, name: str) -> str:
        """Return a header value by case-insensitive name, or '' if absent.

        Header dicts here are plain dicts that keep whatever casing the
        server used, so ``content-type`` and ``Content-Type`` must both match.
        """
        wanted = name.lower()
        for key, value in headers.items():
            if key.lower() == wanted:
                return value
        return ''

    def _matches_patterns(self, url: str) -> bool:
        """Return True if ``url`` passes the include/exclude filters."""
        if any(pattern.search(url) for pattern in self._exclude_patterns):
            return False
        if self._include_patterns:
            return any(pattern.search(url) for pattern in self._include_patterns)
        return True

    def _enqueue(self, url: str) -> None:
        """Add a URL to the crawl queue unless it was queued before."""
        if url in self._queued_urls:
            return
        self._queued_urls.add(url)
        self.url_queue.append(url)
        self.stats['urls_discovered'] += 1

    def crawl(self) -> Iterator[CrawledDocument]:
        """
        Execute the crawl and yield documents as they are processed.

        Pages are visited breadth-first from the entry URL. URLs disallowed
        by robots.txt, non-HTML responses and JavaScript-rendered pages are
        counted as skipped; fetch and processing failures are counted as
        errors. Only successfully extracted pages count toward ``max_pages``.

        Yields:
            CrawledDocument: Structured document with extracted content
        """
        logger.info(f"Starting crawl from {self.entry_url} (max_pages: {self.max_pages})")
        self.start_time = datetime.now()
        # Add entry URL to queue
        normalized_entry = self.url_normalizer.normalize(self.entry_url)
        if normalized_entry:
            self._enqueue(normalized_entry)
        # A Crawl-delay from robots.txt may only slow the crawler down; it
        # never lowers the delay the caller configured.
        crawl_delay = self.robots_parser.get_crawl_delay(self.entry_url)
        if crawl_delay and crawl_delay > self.rate_limiter.delay_seconds:
            self.rate_limiter.set_delay(crawl_delay)

        # Main crawl loop
        while self.url_queue and self.pages_processed < self.max_pages:
            url = self.url_queue.popleft()
            if url in self.visited_urls:
                continue
            self.visited_urls.add(url)

            # Check robots.txt permission
            if not self.robots_parser.can_fetch(url):
                logger.info(f"Skipping {url} (disallowed by robots.txt)")
                self.stats['skipped'] += 1
                continue

            self.rate_limiter.wait()
            logger.info(f"Fetching {url} ({self.pages_processed + 1}/{self.max_pages})")
            fetch_result = self.http_fetcher.fetch(url)
            if not fetch_result.success:
                self._record_error(fetch_result.error or "Unknown error")
                continue

            # Only HTML documents are processed.
            content_type = self._get_header(fetch_result.headers, 'Content-Type').lower()
            if not any(kind in content_type for kind in ('text/html', 'application/xhtml+xml')):
                logger.warning(f"Skipping {url} (Content-Type: {content_type})")
                self.stats['skipped'] += 1
                continue

            try:
                extracted = self.content_extractor.extract(fetch_result.content, url)
                if not extracted.is_static:
                    logger.warning(f"Skipping {url} (JavaScript-rendered content)")
                    self.stats['skipped'] += 1
                    continue
                document = CrawledDocument(
                    url=url,
                    title=extracted.title,
                    content=extracted.text,
                    content_length=len(extracted.text),
                    crawl_timestamp=datetime.now(),
                    http_status=fetch_result.status_code,
                    metadata={
                        'word_count': extracted.word_count,
                        'final_url': fetch_result.final_url
                    }
                )
                links = self.url_normalizer.extract_links(fetch_result.content, url)
            except Exception as e:
                logger.error(f"Error processing {url}: {e}")
                self._record_error(f"Processing error: {str(e)}")
                continue

            self.pages_processed += 1
            self.stats['success'] += 1
            for link in links:
                if self.url_normalizer.is_same_domain(link) and self._matches_patterns(link):
                    self._enqueue(link)
            # Yield outside the try block so that exceptions thrown into the
            # generator by the consumer are not recorded as crawl errors.
            yield document

        self.end_time = datetime.now()
        logger.info(f"Crawl completed. Processed {self.pages_processed} pages.")

    def get_summary(self) -> CrawlSummary:
        """
        Get summary statistics after crawl completion.

        If the crawl never started or did not run to completion, the missing
        start/end time is set to the current time.

        Returns:
            CrawlSummary: Statistics including success/error/skip counts
        """
        if not self.start_time:
            self.start_time = datetime.now()
        if not self.end_time:
            self.end_time = datetime.now()
        duration = (self.end_time - self.start_time).total_seconds()
        return CrawlSummary(
            total_pages_processed=self.stats['success'],
            total_errors=self.stats['errors'],
            total_skipped=self.stats['skipped'],
            total_urls_discovered=self.stats['urls_discovered'],
            start_time=self.start_time,
            end_time=self.end_time,
            duration_seconds=duration,
            # Copy so the summary is a snapshot, not a live view of the stats.
            error_breakdown=dict(self.stats['error_breakdown'])
        )

    def _record_error(self, error: str):
        """Record an error in statistics, grouped by the text before the first ':'."""
        self.stats['errors'] += 1
        error_type = error.partition(':')[0]
        breakdown = self.stats['error_breakdown']
        breakdown[error_type] = breakdown.get(error_type, 0) + 1