# -------------------------------------
# Program 8: Focused Crawler for Local Search
# -------------------------------------
# Install required packages:
# pip install requests beautifulsoup4

import requests                                 # For sending web requests
from bs4 import BeautifulSoup                   # For parsing HTML
from urllib.parse import urljoin, urlparse      # For URL handling
from collections import deque                   # For an efficient FIFO queue
import time                                     # For adding delay between requests

# ------------------ Parameters ------------------
MAX_PAGES = 15             # Max number of pages to crawl
CRAWL_DELAY = 1            # Delay between requests (in seconds)
RELEVANCE_THRESHOLD = 2    # Minimum keyword hits for a page to be considered relevant

# ------------------ Focused Crawler Function ------------------
def focused_crawler(seed_url, keywords):
    visited = set()              # Track visited URLs
    queue = deque([seed_url])    # Queue of URLs to crawl
    pages_crawled = 0            # Counter of pages fetched so far
    relevant_pages = []          # Store (url, score) for matched pages

    # Get the seed's domain to restrict crawling to the same site
    parsed_seed = urlparse(seed_url)
    base_domain = parsed_seed.netloc

    print(f"\n[INFO] Focused crawl started at: {seed_url}")
    print(f"[INFO] Keywords: {keywords}\n")

    # Start crawling
    while queue and pages_crawled < MAX_PAGES:
        current_url = queue.popleft()
        if current_url in visited:
            continue  # Skip if already visited

        # Mark the page as visited up front so skipped or failing URLs
        # are not re-queued and re-fetched later
        visited.add(current_url)

        try:
            # Fetch the webpage
            response = requests.get(current_url, timeout=5,
                                    headers={"User-Agent": "Mozilla/5.0"})

            # Skip error responses and non-HTML pages
            if response.status_code != 200 or 'text/html' not in response.headers.get('Content-Type', ''):
                continue

            # Parse HTML content
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator=' ', strip=True).lower()

            # Count keyword hits in the page text
            match_score = sum(text.count(k.lower()) for k in keywords)

            # If the score is high enough, consider the page relevant
            if match_score >= RELEVANCE_THRESHOLD:
                print(f"[MATCH] Relevant page found ({match_score} hits): {current_url}")
                relevant_pages.append((current_url, match_score))

            pages_crawled += 1

            # Find all internal links on the page
            for link in soup.find_all('a', href=True):
                href = link['href']
                abs_url = urljoin(current_url, href)   # Convert to a full URL
                parsed_url = urlparse(abs_url)

                # Only follow links within the same domain that are not yet visited
                if parsed_url.netloc == base_domain and abs_url not in visited:
                    queue.append(abs_url)

            # Be polite: wait before the next request
            time.sleep(CRAWL_DELAY)

        except Exception as e:
            print(f"[ERROR] Skipping {current_url}: {e}")

    print(f"\n[INFO] Crawling complete. Relevant pages found: {len(relevant_pages)}")
    return relevant_pages

# ------------------ Program Execution ------------------
if __name__ == "__main__":
    # Ask the user for a seed URL and keywords
    seed = input("Enter the seed URL (e.g., https://example.com): ").strip()
    raw_keywords = input("Enter keywords to search (comma-separated): ").strip()
    keyword_list = [kw.strip() for kw in raw_keywords.split(',') if kw.strip()]

    # Run the crawler
    results = focused_crawler(seed, keyword_list)

    # Show the results
    print("\n--- Relevant Pages and Scores ---")
    for url, score in results:
        print(f"{url} (Keyword hits: {score})")

    input()  # Pause so the console output stays visible before exiting

# Sample inputs:
# https://en.wikipedia.org/wiki/Natural_language_processing
# machine learning, NLP, artificial intelligence
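
# ------------------ Example: Direct Function Call ------------------
# A minimal sketch (kept commented out so the script's behavior is unchanged)
# showing how focused_crawler() can be called non-interactively, using the
# sample seed URL and keywords listed above. Network access is assumed, and
# the actual hit counts depend on the live page contents.
#
#   results = focused_crawler(
#       "https://en.wikipedia.org/wiki/Natural_language_processing",
#       ["machine learning", "NLP", "artificial intelligence"],
#   )
#   for url, score in results:
#       print(f"{url} -> {score} keyword hits")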