# -------------------------------------
# Program 8: Focused Crawler for Local Search
# -------------------------------------
# Install required packages:
# pip install requests beautifulsoup4

import requests                                 # For sending web requests
from bs4 import BeautifulSoup                   # For parsing HTML
from urllib.parse import urljoin, urlparse      # For URL handling
from collections import deque                   # For an efficient FIFO queue
import time                                     # For adding delay between requests

# ------------------ Parameters ------------------
MAX_PAGES = 15             # Max number of pages to crawl
CRAWL_DELAY = 1            # Delay between requests (in seconds)
RELEVANCE_THRESHOLD = 2    # Minimum keyword hits for a page to be considered relevant

# ------------------ Focused Crawler Function ------------------
def focused_crawler(seed_url, keywords):
    visited = set()              # Track visited URLs
    queue = deque([seed_url])    # Queue of URLs to crawl
    pages_crawled = 0            # Counter of pages fetched so far
    relevant_pages = []          # Store (url, score) for matched pages

    # Get the seed's domain to restrict crawling to the same site
    parsed_seed = urlparse(seed_url)
    base_domain = parsed_seed.netloc

    print(f"\n[INFO] Focused crawl started at: {seed_url}")
    print(f"[INFO] Keywords: {keywords}\n")

    # Start crawling
    while queue and pages_crawled < MAX_PAGES:
        current_url = queue.popleft()
        if current_url in visited:
            continue  # Skip if already visited

        # Mark the page as visited up front so skipped or failing URLs
        # are not re-queued and re-fetched later
        visited.add(current_url)

        try:
            # Fetch the webpage
            response = requests.get(current_url, timeout=5,
                                    headers={"User-Agent": "Mozilla/5.0"})

            # Skip error responses and non-HTML pages
            if response.status_code != 200 or 'text/html' not in response.headers.get('Content-Type', ''):
                continue

            # Parse HTML content
            soup = BeautifulSoup(response.text, 'html.parser')
            text = soup.get_text(separator=' ', strip=True).lower()

            # Count keyword hits in the page text
            match_score = sum(text.count(k.lower()) for k in keywords)

            # If the score is high enough, consider the page relevant
            if match_score >= RELEVANCE_THRESHOLD:
                print(f"[MATCH] Relevant page found ({match_score} hits): {current_url}")
                relevant_pages.append((current_url, match_score))

            pages_crawled += 1

            # Find all internal links on the page
            for link in soup.find_all('a', href=True):
                href = link['href']
                abs_url = urljoin(current_url, href)   # Convert to a full URL
                parsed_url = urlparse(abs_url)

                # Only follow links within the same domain that are not yet visited
                if parsed_url.netloc == base_domain and abs_url not in visited:
                    queue.append(abs_url)

            # Be polite: wait before the next request
            time.sleep(CRAWL_DELAY)

        except Exception as e:
            print(f"[ERROR] Skipping {current_url}: {e}")

    print(f"\n[INFO] Crawling complete. Relevant pages found: {len(relevant_pages)}")
    return relevant_pages

# ------------------ Program Execution ------------------
if __name__ == "__main__":
    # Ask the user for a seed URL and keywords
    seed = input("Enter the seed URL (e.g., https://example.com): ").strip()
    raw_keywords = input("Enter keywords to search (comma-separated): ").strip()
    keyword_list = [kw.strip() for kw in raw_keywords.split(',') if kw.strip()]

    # Run the crawler
    results = focused_crawler(seed, keyword_list)

    # Show the results
    print("\n--- Relevant Pages and Scores ---")
    for url, score in results:
        print(f"{url} (Keyword hits: {score})")

    input()  # Pause so the console output stays visible before exiting

# Sample inputs:
# https://en.wikipedia.org/wiki/Natural_language_processing
# machine learning, NLP, artificial intelligence
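
# ------------------ Example: Direct Function Call ------------------
# A minimal sketch (kept commented out so the script's behavior is unchanged)
# showing how focused_crawler() can be called non-interactively, using the
# sample seed URL and keywords listed above. Network access is assumed, and
# the actual hit counts depend on the live page contents.
#
#   results = focused_crawler(
#       "https://en.wikipedia.org/wiki/Natural_language_processing",
#       ["machine learning", "NLP", "artificial intelligence"],
#   )
#   for url, score in results:
#       print(f"{url} -> {score} keyword hits")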