# ---------------------------------------------
# Program 5: Text Mining & Meta Info Extraction
# ---------------------------------------------
# Install required libraries:
# pip install requests beautifulsoup4 nltk

import requests                        # To fetch webpage HTML
from bs4 import BeautifulSoup          # For parsing HTML content
import nltk
import re                              # For text cleaning using regex
from nltk.corpus import stopwords      # Common words like 'the', 'and'
from nltk.stem import PorterStemmer    # To stem words like "running" -> "run"

# Download stopwords (run once)
nltk.download('stopwords')

# ------------------ Function to Extract Meta Info & Text ------------------
def extract_webpage_data(url):
    headers = {
        "User-Agent": "Mozilla/5.0"  # Pretend to be a browser
    }
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print("Failed to retrieve page.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract meta tags from the head
    # (soup.title.string can be None, so guard against that too)
    title = soup.title.string.strip() if soup.title and soup.title.string else "No title"
    description = soup.find('meta', attrs={'name': 'description'})
    keywords = soup.find('meta', attrs={'name': 'keywords'})
    author = soup.find('meta', attrs={'name': 'author'})

    # Use .get('content') so a meta tag missing its content
    # attribute does not raise a KeyError
    meta_info = {
        "Title": title,
        "Description": description.get('content', '').strip() if description else "No description",
        "Keywords": keywords.get('content', '').strip() if keywords else "No keywords",
        "Author": author.get('content', '').strip() if author else "No author"
    }

    # Remove scripts and styles from the page body
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    # Get plain text from HTML
    raw_text = soup.get_text()
    lines = [line.strip() for line in raw_text.splitlines()]
    full_text = ' '.join(line for line in lines if line)

    return meta_info, full_text

# ------------------ Text Preprocessing Function ------------------
def preprocess_text(text):
    text = text.lower()                    # Convert to lowercase
    text = re.sub(r'http\S+', '', text)    # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)   # Remove punctuation and numbers
    tokens = text.split()                  # Tokenize text

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Remove stopwords and apply stemming
    cleaned_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    return ' '.join(cleaned_tokens)

# ------------------ Main Execution ------------------
if __name__ == "__main__":
    # Input a Wikipedia page or any other article, e.g.:
    # https://en.wikipedia.org/wiki/Japan
    url = input("Enter a webpage URL: ").strip()
    result = extract_webpage_data(url)

    if result:
        meta, text = result

        print("\n--- Meta Information ---")
        for key, value in meta.items():
            print(f"{key}: {value}")

        print("\n--- Raw Text Snippet (First 500 chars) ---")
        print(text[:500] + "...\n")

        cleaned_text = preprocess_text(text)
        print("--- Cleaned & Preprocessed Text Snippet (First 500 chars) ---")
        print(cleaned_text[:500] + "...")
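
# ------------------ Optional: Word Frequency Sketch ------------------
# A minimal sketch (an addition, not part of the original program) showing
# one way the output of preprocess_text could feed a simple term-frequency
# count using only the standard library. The top_terms function and the
# sample strings below are illustrative assumptions.
from collections import Counter

def top_terms(cleaned_text, n=10):
    """Return the n most common stemmed tokens from preprocessed text."""
    return Counter(cleaned_text.split()).most_common(n)

# Hypothetical usage:
#   top_terms("japan island japan tokyo island japan", 2)
#   -> [('japan', 3), ('island', 2)]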