diff --git a/SearchEngine/__pycache__/index.cpython-312.pyc b/SearchEngine/__pycache__/index.cpython-312.pyc new file mode 100644 index 0000000..bdf7c8f Binary files /dev/null and b/SearchEngine/__pycache__/index.cpython-312.pyc differ diff --git a/SearchEngine/__pycache__/search.cpython-312.pyc b/SearchEngine/__pycache__/search.cpython-312.pyc new file mode 100644 index 0000000..12f68c5 Binary files /dev/null and b/SearchEngine/__pycache__/search.cpython-312.pyc differ diff --git a/SearchEngine/__pycache__/stack.cpython-312.pyc b/SearchEngine/__pycache__/stack.cpython-312.pyc new file mode 100644 index 0000000..75f8b5c Binary files /dev/null and b/SearchEngine/__pycache__/stack.cpython-312.pyc differ diff --git a/SearchEngine/documents/doc2.txt b/SearchEngine/documents/doc2.txt index 209cd86..669b2bc 100644 --- a/SearchEngine/documents/doc2.txt +++ b/SearchEngine/documents/doc2.txt @@ -1 +1,4 @@ -Data structures are a fundamental concept in computer science. They are used to organize and store data efficiently. Examples include arrays, linked lists, stacks, and queues. Understanding these structures is key to writing effective algorithms. \ No newline at end of file +Data structures are a fundamental concept in computer science. +They are used to organize and store data efficiently. +Examples include arrays, linked lists, stacks, and queues. +Understanding these structures is key to writing effective algorithms. \ No newline at end of file diff --git a/SearchEngine/index.py b/SearchEngine/index.py index 145f6ae..8abb6fc 100644 --- a/SearchEngine/index.py +++ b/SearchEngine/index.py @@ -2,40 +2,53 @@ import string class InvertedIndex: - """Inverted Index for storing word → doc → frequency.""" + """A simple inverted index: word -> document -> frequency.""" def __init__(self): + # Dictionary to store index + # Example: { "word": {"doc1": 2, "doc2": 1} } self.index = {} def add_doc(self, doc, text): - text = text.lower().translate(str.maketrans('', '', string.punctuation)) - words = [w.strip() for w in text.split() if w.strip()] - for w in words: - if w not in self.index: - self.index[w] = {} - self.index[w][doc] = self.index[w].get(doc, 0) + 1 + """Add a document to the index.""" + # Convert text to lowercase and remove punctuation + clean_text = text.lower().translate(str.maketrans('', '', string.punctuation)) + + # Split text into words + words = clean_text.split() + + for word in words: + if word not in self.index: + self.index[word] = {} + # Count how many times a word appears in a document + self.index[word][doc] = self.index[word].get(doc, 0) + 1 + + def _clean_query(self, query): + """Helper function to clean search queries.""" + query = query.lower().translate(str.maketrans('', '', string.punctuation)) + return query.split() def search(self, query): + """Search for documents that contain all words in the query.""" q_words = self._clean_query(query) if not q_words: return [] + # Start with documents containing the first word if q_words[0] not in self.index: return [] results = set(self.index[q_words[0]].keys()) - for w in q_words[1:]: - if w not in self.index: + # Keep only docs that contain all other words + for word in q_words[1:]: + if word not in self.index: return [] - results &= set(self.index[w].keys()) + results = results & set(self.index[word].keys()) + # Rank results by score (sum of word frequencies) ranked = [] for doc in results: - score = sum(self.index[w].get(doc, 0) for w in q_words) + score = sum(self.index[word].get(doc, 0) for word in q_words) ranked.append({"doc": doc, "score": score}) return sorted(ranked, key=lambda x: x["score"], reverse=True) - - def _clean_query(self, query): - query = query.lower().translate(str.maketrans('', '', string.punctuation)) - return [w.strip() for w in query.split() if w.strip()] diff --git a/SearchEngine/search.py b/SearchEngine/search.py index e52fe29..981c6d8 100644 --- a/SearchEngine/search.py +++ b/SearchEngine/search.py @@ -3,87 +3,99 @@ from stack import Stack from index import InvertedIndex +# Path to the folder that has all text files +PATH = "./documents" # First we go to base dir + class SearchSim: - """Search engine simulation using Inverted Index + Stack (for history).""" + """A simple search engine simulation.""" - def __init__(self, path="./"): - self.index = InvertedIndex() - self.history = Stack() - self.path = path - self.results = [] - self._load() + def __init__(self, path=PATH): + self.index = InvertedIndex() # for word search + self.history = Stack() # to store search history + self.path = path # folder path + self.results = [] # last search results + self._load() # load documents into index def _load(self): - print("Building index...") + """Load all text files and build the index.""" + print("Loading index...") try: files = [f for f in os.listdir(self.path) if f.endswith(".txt")] if not files: - print("No .txt files in documents/") + print("No .txt files found in documents folder.") for f in files: - doc = os.path.splitext(f)[0] + doc = os.path.splitext(f)[0] # filename without .txt with open(os.path.join(self.path, f), "r", encoding="utf-8") as file: self.index.add_doc(doc, file.read()) - print(f"Index built with {len(files)} docs.") + print(f"Index built with {len(files)} documents.") except FileNotFoundError: - print("documents/ folder not found.") + print("Documents folder not found!") exit() def run(self): + """Main loop for user interaction.""" while True: - user_input = input("\nEnter search query, 'back', 'show', or 'quit': ").strip() - if user_input.lower() == "quit": + user_input = input("\nEnter search query, 'back', 'show', or 'quit': ").strip().lower() + + if user_input == "quit": + print("Goodbye!") break - elif user_input.lower() == "back": + elif user_input == "back": self._back() - elif user_input.lower() == "show": + elif user_input == "show": self.history.show() - else: + elif user_input: # if user typed something self._search(user_input) - def _search(self, q): - self.history.push(q) - print(f"\nSearching: '{q}'") - self.results = self.index.search(q) + def _search(self, query): + """Handle search queries.""" + self.history.push(query) + print(f"\nSearching for: '{query}'") + self.results = self.index.search(query) if not self.results: - print("No matches.") + print("No matches found.") return - print(f"Found {len(self.results)} docs:") - for i, r in enumerate(self.results, 1): + print(f"Found {len(self.results)} document(s):") + for i, r in enumerate(self.results, start=1): print(f"{i}. {r['doc']}.txt | Score: {r['score']}") self._open_doc() def _open_doc(self): + """Open and show the contents of a selected document.""" while True: - sel = input("\nEnter doc number to open, or 'next': ").strip().lower() - if sel == "next": + choice = input("\nEnter document number to open, or 'next': ").strip().lower() + if choice == "next": break try: - i = int(sel) - if 1 <= i <= len(self.results): - doc = self.results[i - 1]["doc"] + num = int(choice) + if 1 <= num <= len(self.results): + doc = self.results[num - 1]["doc"] with open(os.path.join(self.path, f"{doc}.txt"), "r", encoding="utf-8") as f: print(f"\n--- {doc}.txt ---\n{f.read()}\n------------------") else: print("Invalid number.") except ValueError: - print("Enter a number or 'next'.") + print("Please enter a valid number or 'next'.") except FileNotFoundError: print("File not found.") def _back(self): + """Go back to the previous search query.""" if len(self.history.items) <= 1: - print("No previous search.") + print("No previous search available.") return - self.history.pop() - prev = self.history.peek() - print(f"\nBack to: '{prev}'") - self.results = self.index.search(prev) + + self.history.pop() # remove current search + prev_query = self.history.peek() # last one left + print(f"\nBack to: '{prev_query}'") + + self.results = self.index.search(prev_query) if not self.results: - print("No matches.") + print("No matches found.") else: - for i, r in enumerate(self.results, 1): + for i, r in enumerate(self.results, start=1): print(f"{i}. {r['doc']}.txt | Score: {r['score']}") self._open_doc() diff --git a/SearchEngine/stack.py b/SearchEngine/stack.py index 4ddb398..4ea08c1 100644 --- a/SearchEngine/stack.py +++ b/SearchEngine/stack.py @@ -1,4 +1,4 @@ -# stack.py +# stack.py class Stack: """Custom Stack implementation using list (LIFO).""" @@ -24,4 +24,4 @@ def peek(self): return None if self.empty() else self.items[-1] def show(self): - print("Current Stack (top → bottom):", list(reversed(self.items)) if self.items else "Empty") + print("Current Stack (top TO bottom):", list(reversed(self.items)) if self.items else "Empty")