# Requirements - Install necessary packages if not installed
# py -3.10 -m pip install requests beautifulsoup4 langchain-community bs4 langchain-text-splitters langchain-chroma langchain-ollama
# --1: Import necessary libraries
import os
import bs4
import requests # Needed for session handling
import hashlib # To track URL changes
import shutil # For clearing ChromaDB
import time # To ensure cleanup before deletion
import platform
import psutil
import subprocess
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document # ✅ Import Document wrapper
from langchain_ollama import OllamaLLM # ✅ Import Ollama LLM
from langchain_community.vectorstores import Chroma # ✅ Import ChromaDB
# --- Host diagnostics ----------------------------------------------------
# Dump basic OS/platform/CPU/memory facts to the console so each run
# records the environment it executed in.
SEP = "--"
print(SEP)
print("OS Name:")
print(os.name)
print(SEP)
print("Platform:")
print(platform.platform())
print(SEP)
print("CPU Count:")
print(psutil.cpu_count())
print(SEP)
# Total physical RAM, converted from bytes to GiB.
mem = psutil.virtual_memory().total / 1024 ** 3
print(f"Physical memory = {mem:0.2f} GiB")
print(SEP)
# Full virtual-memory breakdown (total/available/used/...).
mem2 = psutil.virtual_memory()
print("Physical memory details:")
print(mem2)
print(SEP)
# --- Restart Ollama ------------------------------------------------------
# Step 1: terminate every running process whose name contains "ollama" so the
# backend starts from a clean state.
print("\n🔍 Searching for 'ollama' processes running...\n")
for p in psutil.process_iter(['pid', 'name']):
    # p.info['name'] can be None for some system processes — guard before .lower()
    name = p.info['name'] or ""
    if "ollama" in name.lower():
        print(f"🛑 Stopping: {p.info['name']} (PID: {p.info['pid']})")
        try:
            # `p` is already a psutil.Process — no need to re-wrap the PID.
            p.terminate()
        except (psutil.NoSuchProcess, psutil.AccessDenied) as err:
            # Process vanished on its own, or we lack permission — report and move on.
            print(f"⚠️ Could not stop PID {p.info['pid']}: {err}")
# Step 2: give the OS a moment to reap the terminated processes.
time.sleep(3)
# Step 3: launch a fresh `ollama serve` in the background (output suppressed).
print("\n🚀 Starting a new Ollama process...\n")
subprocess.Popen(["ollama", "serve"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print("\n✅ Ollama has been restarted successfully!\n")
# Verification pass: list whatever Ollama processes are visible now.
print("🔍 Searching for 'ollama app' process...\n")
for p in psutil.process_iter(['pid', 'name']):
    if "ollama" in (p.info['name'] or "").lower():
        print(f"✅ Found: {p.info['name']} (PID: {p.info['pid']})")
# --- Global configuration -------------------------------------------------
# Identify as a desktop browser; some sites block the default python-requests UA.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
os.environ["USER_AGENT"] = USER_AGENT  # also exposed via env for library code that reads it

# Where the Chroma vector store persists, and where the last-scraped URL's
# hash is remembered (used to decide when the store must be rebuilt).
chroma_db_path = "chroma_db"
url_hash_file = "last_url.txt"

# One shared HTTP session carrying the custom User-Agent on every request.
session = requests.Session()
session.headers["User-Agent"] = USER_AGENT

# Module-level state, populated by set_new_url().
vectorstore = None
article_title = "Unknown Title"  # default until a page is scraped
def clear_chroma_db():
    """Close the open Chroma vector store (if any) and delete its on-disk data.

    Drops the module-level ``vectorstore`` reference so Chroma can release its
    file handles, waits briefly for the OS to let go of them, then removes the
    ``chroma_db_path`` directory. Prints a warning instead of raising when the
    directory is still locked by another process.
    """
    global vectorstore
    # Compare against None explicitly: a vector-store object may be falsy
    # (e.g. via __len__ on an empty collection) while still holding files open.
    if vectorstore is not None:
        print("\n🔄 Closing ChromaDB connection before deleting the database...")
        vectorstore = None  # drop the only reference so the store can close
        # Give Chroma/SQLite a moment to release file locks before deletion;
        # only needed when a store was actually open.
        time.sleep(2)
    if os.path.exists(chroma_db_path):
        try:
            shutil.rmtree(chroma_db_path)
            print("\n✅ ChromaDB successfully cleared.")
        except OSError:  # includes PermissionError; files can stay locked on Windows
            print("\n🚨 ChromaDB is still in use. Retry after closing any processes using it.")
def set_new_url():
    """Prompt for a webpage URL, scrape it, and (re)build the vector store.

    Side effects:
      * sets the module globals ``url``, ``article_title`` and ``vectorstore``;
      * persists the MD5 of the URL in ``url_hash_file`` so a changed URL
        triggers a ChromaDB rebuild via ``clear_chroma_db()``;
      * exits the process if fetching, extraction, splitting or embedding fails.
    """
    global vectorstore, url, article_title

    # Keep asking until a non-empty URL is entered.
    while True:
        url = input("\n🌍 Enter the webpage URL: ").strip()
        if url:
            break

    # MD5 here is only a change-detection fingerprint, not a security hash.
    url_hash = hashlib.md5(url.encode()).hexdigest()

    # If a previous run stored a different URL hash, the old embeddings are
    # stale — wipe the database before rebuilding.
    if os.path.exists(url_hash_file):
        with open(url_hash_file, "r") as f:
            last_url_hash = f.read().strip()
        if last_url_hash != url_hash:
            print("\n🔄 URL changed! Clearing ChromaDB to rebuild with new data...")
            clear_chroma_db()
    else:
        print("\n🆕 First-time run. Creating new ChromaDB.")

    # Remember the current URL for the next run.
    with open(url_hash_file, "w") as f:
        f.write(url_hash)

    # Download the page; fail fast on network errors or HTTP error statuses
    # instead of feeding an error page into the pipeline. The timeout keeps
    # the script from hanging forever on an unresponsive host.
    print("\n🔗 Scraping content from:", url)
    try:
        page = session.get(url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as e:
        print(f"\n🚨 Failed to fetch the page: {e}")
        exit()
    soup = bs4.BeautifulSoup(page.content, "html.parser")

    # Article title lives in <h1 class="post-title">.
    title_div = soup.find("h1", class_="post-title")
    article_title = title_div.get_text(strip=True) if title_div else "Unknown Title"

    # Body text lives in <div class="post-content">.
    post_content_div = soup.find("div", class_="post-content")
    if post_content_div:
        raw_text = post_content_div.get_text(strip=True)
    else:
        print("\n🚨 No content found inside <div class='post-content'>!")
        exit()

    # Wrap in a LangChain Document so the splitter can process it.
    docs = [Document(page_content=raw_text)]

    # Debugging: show what was extracted.
    print(f"\n📜 Extracted Title: {article_title}")
    print(raw_text[:500] + "...\n" if raw_text else "No text extracted!")

    # Split the page into overlapping chunks sized for the embedding model.
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200, chunk_overlap=100, add_start_index=True
    )
    all_splits = text_splitter.split_documents(docs)
    print(f"\n🔹 Number of text chunks created: {len(all_splits)}")
    if not all_splits:
        print("\n🚨 No text chunks were created! The document might be empty or incorrectly processed.")
        exit()

    # Embeddings come from a locally-served Ollama model.
    from langchain_ollama import OllamaEmbeddings
    try:
        local_embeddings = OllamaEmbeddings(model="all-minilm")
    except Exception as e:
        print("\n🚨 Error loading Ollama embeddings. Ensure Ollama is running and the model is installed.")
        print("Run: ollama serve and ollama pull all-minilm")
        print("Error Details:", e)
        exit()

    # Smoke-test the embedder on the first chunk before building the store.
    sample_embeddings = local_embeddings.embed_documents([all_splits[0].page_content]) if all_splits else []
    if not sample_embeddings:
        print("\n🚨 Embedding model failed! No embeddings were generated.")
        exit()

    # Build (or rebuild) the persistent Chroma store from the chunks.
    vectorstore = Chroma.from_documents(documents=all_splits, embedding=local_embeddings, persist_directory=chroma_db_path)
    print(f"\n✅ Webpage '{article_title}' has been processed! You can now ask questions.")
# Build the initial vector store before entering the chat loop:
# set_new_url() prompts for the URL, scrapes it, and populates `vectorstore`.
set_new_url()
# Chat model served by the local Ollama instance restarted above.
llm = OllamaLLM(model="llama3.2:1b") # NOTE(review): model must already be pulled — verify with `ollama list`
print("\n💬 You can now ask questions! Type 'change url' to scrape a new page or 'exit app' to quit.")
# --- Interactive Q&A loop -------------------------------------------------
# Two commands are recognized (case-insensitive): "exit app" quits,
# "change url" re-runs set_new_url(); anything else is answered via RAG.
while True:
    question = input(f"\n🔎 [{article_title}] Enter your question: ").strip()
    command = question.lower()
    if command == "exit app":
        print("\n👋 Exiting the app. Goodbye!")
        break
    if command == "change url":
        print("\n🔄 Switching to a new URL...\n")
        set_new_url()
        continue
    # Fetch the 3 chunks most similar to the question from the current store
    # (re-created each turn because `vectorstore` changes after "change url").
    retrieved_docs = vectorstore.as_retriever(
        search_type="similarity", search_kwargs={"k": 3}
    ).invoke(question)
    context = ' '.join(doc.page_content for doc in retrieved_docs)
    prompt = f"""Answer the question according to the context given very briefly:
Question: {question}
Context: {context}
"""
    response = llm.invoke(prompt)
    # Render the answer between banner lines.
    banner = "=" * 80
    print("\n" + banner)
    print(f"📌 Question: '{question}' from: '{article_title}'")
    print(banner)
    print(f"📌 Answer: {response}")
    print(banner + "\n")