# ---------------------------------------------
# Program 5: Text Mining & Meta Info Extraction
# ---------------------------------------------
# Install required libraries:
# pip install requests beautifulsoup4 nltk

import requests                        # To fetch webpage HTML
from bs4 import BeautifulSoup          # For parsing HTML content
import nltk
import re                              # For text cleaning using regex
from nltk.corpus import stopwords      # Common words like 'the', 'and'
from nltk.stem import PorterStemmer    # To stem words like "running" -> "run"

# Download stopwords (run once)
nltk.download('stopwords')

# ------------------ Function to Extract Meta Info & Text ------------------
def extract_webpage_data(url):
    headers = {
        "User-Agent": "Mozilla/5.0"  # Pretend to be a browser
    }
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print("Failed to retrieve page.")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract meta tags from the head
    # (soup.title.string can be None, so guard against that too)
    title = soup.title.string.strip() if soup.title and soup.title.string else "No title"
    description = soup.find('meta', attrs={'name': 'description'})
    keywords = soup.find('meta', attrs={'name': 'keywords'})
    author = soup.find('meta', attrs={'name': 'author'})

    # Use .get('content') so a meta tag missing its content
    # attribute does not raise a KeyError
    meta_info = {
        "Title": title,
        "Description": description.get('content', '').strip() if description else "No description",
        "Keywords": keywords.get('content', '').strip() if keywords else "No keywords",
        "Author": author.get('content', '').strip() if author else "No author"
    }

    # Remove scripts and styles from the page body
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    # Get plain text from HTML
    raw_text = soup.get_text()
    lines = [line.strip() for line in raw_text.splitlines()]
    full_text = ' '.join(line for line in lines if line)

    return meta_info, full_text

# ------------------ Text Preprocessing Function ------------------
def preprocess_text(text):
    text = text.lower()                    # Convert to lowercase
    text = re.sub(r'http\S+', '', text)    # Remove URLs
    text = re.sub(r'[^a-z\s]', '', text)   # Remove punctuation and numbers
    tokens = text.split()                  # Tokenize text

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Remove stopwords and apply stemming
    cleaned_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]

    return ' '.join(cleaned_tokens)

# ------------------ Main Execution ------------------
if __name__ == "__main__":
    # Input a Wikipedia page or any other article, e.g.:
    # https://en.wikipedia.org/wiki/Japan
    url = input("Enter a webpage URL: ").strip()
    result = extract_webpage_data(url)

    if result:
        meta, text = result

        print("\n--- Meta Information ---")
        for key, value in meta.items():
            print(f"{key}: {value}")

        print("\n--- Raw Text Snippet (First 500 chars) ---")
        print(text[:500] + "...\n")

        cleaned_text = preprocess_text(text)
        print("--- Cleaned & Preprocessed Text Snippet (First 500 chars) ---")
        print(cleaned_text[:500] + "...")
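
# ------------------ Optional: Word Frequency Sketch ------------------
# A minimal sketch (an addition, not part of the original program) showing
# one way the output of preprocess_text could feed a simple term-frequency
# count using only the standard library. The top_terms function and the
# sample strings below are illustrative assumptions.
from collections import Counter

def top_terms(cleaned_text, n=10):
    """Return the n most common stemmed tokens from preprocessed text."""
    return Counter(cleaned_text.split()).most_common(n)

# Hypothetical usage:
#   top_terms("japan island japan tokyo island japan", 2)
#   -> [('japan', 3), ('island', 2)]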