[GH-ISSUE #450] UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 10: invalid continuation byte #854

Open
opened 2026-03-13 22:49:14 +03:00 by kerem · 0 comments
Owner

Originally created by @martinxb on GitHub (Feb 16, 2023).
Original GitHub issue: https://github.com/alexal1/Insomniac/issues/450

I'm writing a basic python GUI NLP which allows a lawyer to compare the new case with old cases to find similar case file or files which can help with research and building a case of the new case they are working on.

import tkinter as tk
from tkinter import filedialog
from tkinter import messagebox
import gensim
import spacy
import os
import re
import difflib

GUI

window = tk.Tk()
window.title("Legal NLP GUI")
window.geometry("400x400")

Functions

def upload_file():
# Allow selection of any type of file
file_name = filedialog.askopenfilename(title = "Select file", filetypes = (("All files","."),("Text files",".txt"),("PDF files",".pdf"),("Doc files",".doc"),("CSV files",".csv"),("HTML files",".html"),("XLS files",".xls"),("XLSX files",".xlsx"),("JPEG files",".jpg"),("JPEG files",".jpeg"),("PNG files",".png")))
if file_name:
entry_upload.configure(state="normal")
entry_upload.delete(0,tk.END)
entry_upload.insert(0,file_name)
entry_upload.configure(state="readonly")

def upload_folder():
folder_path = filedialog.askdirectory(title="Select folder")
if folder_path:
entry_folder.configure(state="normal")
entry_folder.delete(0, tk.END)
entry_folder.insert(0, folder_path)
entry_folder.configure(state="readonly")

Allow the user to save the similar file to any directory they choose

def save_file():
save_path = filedialog.asksaveasfilename(title="Select file", filetypes=(
("All files", "."), ("Text files", ".txt"), ("PDF files", ".pdf"), ("Doc files", ".doc"),
("CSV files", "
.csv"), ("HTML files", ".html"), ("XLS files", ".xls"), ("XLSX files", ".xlsx"),
("JPG files", "
.jpg"), ("JPEG files", ".jpeg"), ("PNG files", ".png")))
if save_path:
entry_save.configure(state="normal")
entry_save.delete(0, tk.END)
entry_save.insert(0, save_path)
entry_save.configure(state="readonly")

def compare():
file_path = entry_upload.get()
folder_path = entry_folder.get()

# Read the contents of the selected file
with open(file_path, "r", encoding="utf-8") as f:
    old_case_file = f.read()

# Read the contents of the selected folder
folder_contents = ""
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, "r", encoding="utf-8") as f:
        folder_contents += f.read() + "\n"

# Get the unified diff of the two texts
diff = difflib.unified_diff(old_case_file.splitlines(), folder_contents.splitlines(), lineterm='')
diff_text = '\n'.join(list(diff))

# Clear the text box and show the diff
text_box.delete('1.0', tk.END)
text_box.insert(tk.INSERT, diff_text)

# Error checking
if not file_path or not os.path.exists(file_path):
    messagebox.showerror(title="Error", message="Please select a valid file")
    return

if not folder_path or not os.path.exists(folder_path):
    messagebox.showerror(title="Error", message="Please select a valid folder")
    return

# Load spacy model
nlp = spacy.load('en_core_web_sm')

# Read the new case file
with open(file_path, 'r') as f:
    new_case_file = f.read()

# Create an object to store the similarity scores
similarities = []

# Iterate over the files in the selected folder
for filename in os.listdir(folder_path):
    # Skip if the file is not a text file
    if not filename.endswith(".txt"):
        continue

    # Read the old case file
    with open(os.path.join(folder_path, filename), 'r') as f:
        old_case_file = f.read()

    # Preprocess the data
    new_case_file = re.sub(r'[^a-zA-Z0-9\s]', ' ', new_case_file)
    old_case_file = re.sub(r'[^a-zA-Z0-9\s]', ' ', old_case_file)

    # Tokenize the data
    new_case_file_tokens = [token.text for token in nlp(new_case_file)]
    old_case_file_tokens = [token.text for token in nlp(old_case_file)]

    # Compute similarity
    sim_score = gensim.similarities.jaccard(new_case_file_tokens, old_case_file_tokens)
    similarities.append((filename, float(sim_score)))

# Sort the similarities in descending order
similarities = sorted(similarities, key=lambda x: x[1], reverse=True)

# Show the most similar cases
if len(similarities) < 5:
    messagebox.showerror(title="Error", message="There are less than 5 similar cases")
else:
    messagebox.showinfo(title="Similar Cases",
                        message="The most similar cases are: \n\n" + "\n".join([x[0] for x in similarities[:5]]))

Widgets

label_upload = tk.Label(text="Select new case file:")
label_upload.grid(column=0, row=0)

entry_upload = tk.Entry(width=30, state="readonly")
entry_upload.grid(column=1, row=0)

button_upload = tk.Button(text="Browse", command=upload_file)
button_upload.grid(column=2, row=0)

label_folder = tk.Label(text="Select folder with old case files:")
label_folder.grid(column=0, row=1)

entry_folder = tk.Entry(width=30, state="readonly")
entry_folder.grid(column=1, row=1)

button_folder = tk.Button(text="Browse", command=upload_folder)
button_folder.grid(column=2, row=1)

button_compare = tk.Button(text="Compare", command=compare)
button_compare.grid(column=3, row=1)

label_save = tk.Label(text="Select a file path to save the compared file:")
label_save.grid(column=0, row=2)

entry_save = tk.Entry(width=30, state="readonly")
entry_save.grid(column=2, row=2)

button_save = tk.Button(text="Save", command=save_file)
button_save.grid(column=3, row=2)

text_box = tk.Text(height=10)
text_box.grid(column=1, row=4)

window.mainloop()

But when I chick on the compare button the code gives me errors even if I changed the encoding of the files to utf-8 it still gives me errors below

C:\Users\user\PycharmProjects\pythonProject1\venv\Scripts\python.exe C:\Users\user\PycharmProjects\pythonProject1\main.py
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\Users\user\AppData\Local\Programs\Python\Python311\Lib\tkinter_init_.py", line 1948, in call
return self.func(*args)
^^^^^^^^^^^^^^^^
File "C:\Users\user\PycharmProjects\pythonProject1\main.py", line 61, in compare
folder_contents += f.read() + "\n"
^^^^^^^^
File "<frozen codecs>", line 322, in decode
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 10: invalid continuation byte

Originally created by @martinxb on GitHub (Feb 16, 2023). Original GitHub issue: https://github.com/alexal1/Insomniac/issues/450 I'm writing a basic python GUI NLP which allows a lawyer to compare the new case with old cases to find similar case file or files which can help with research and building a case of the new case they are working on. import tkinter as tk from tkinter import filedialog from tkinter import messagebox import gensim import spacy import os import re import difflib # GUI window = tk.Tk() window.title("Legal NLP GUI") window.geometry("400x400") # Functions def upload_file(): \# Allow selection of any type of file file_name = filedialog.askopenfilename(title = "Select file", filetypes = (("All files","*.*"),("Text files","*.txt"),("PDF files","*.pdf"),("Doc files","*.doc"),("CSV files","*.csv"),("HTML files","*.html"),("XLS files","*.xls"),("XLSX files","*.xlsx"),("JPEG files","*.jpg"),("JPEG files","*.jpeg"),("PNG files","*.png"))) if file_name: entry_upload.configure(state="normal") entry_upload.delete(0,tk.END) entry_upload.insert(0,file_name) entry_upload.configure(state="readonly") def upload_folder(): folder_path = filedialog.askdirectory(title="Select folder") if folder_path: entry_folder.configure(state="normal") entry_folder.delete(0, tk.END) entry_folder.insert(0, folder_path) entry_folder.configure(state="readonly") # Allow the user to save the similar file to any directory they choose def save_file(): save_path = filedialog.asksaveasfilename(title="Select file", filetypes=( ("All files", "*.*"), ("Text files", "*.txt"), ("PDF files", "*.pdf"), ("Doc files", "*.doc"), ("CSV files", "*.csv"), ("HTML files", "*.html"), ("XLS files", "*.xls"), ("XLSX files", "*.xlsx"), ("JPG files", "*.jpg"), ("JPEG files", "*.jpeg"), ("PNG files", "*.png"))) if save_path: entry_save.configure(state="normal") entry_save.delete(0, tk.END) entry_save.insert(0, save_path) entry_save.configure(state="readonly") def compare(): file_path = entry_upload.get() folder_path = entry_folder.get() # Read the contents of the selected file with open(file_path, "r", encoding="utf-8") as f: old_case_file = f.read() # Read the contents of the selected folder folder_contents = "" for file_name in os.listdir(folder_path): file_path = os.path.join(folder_path, file_name) with open(file_path, "r", encoding="utf-8") as f: folder_contents += f.read() + "\n" # Get the unified diff of the two texts diff = difflib.unified_diff(old_case_file.splitlines(), folder_contents.splitlines(), lineterm='') diff_text = '\n'.join(list(diff)) # Clear the text box and show the diff text_box.delete('1.0', tk.END) text_box.insert(tk.INSERT, diff_text) # Error checking if not file_path or not os.path.exists(file_path): messagebox.showerror(title="Error", message="Please select a valid file") return if not folder_path or not os.path.exists(folder_path): messagebox.showerror(title="Error", message="Please select a valid folder") return # Load spacy model nlp = spacy.load('en_core_web_sm') # Read the new case file with open(file_path, 'r') as f: new_case_file = f.read() # Create an object to store the similarity scores similarities = [] # Iterate over the files in the selected folder for filename in os.listdir(folder_path): # Skip if the file is not a text file if not filename.endswith(".txt"): continue # Read the old case file with open(os.path.join(folder_path, filename), 'r') as f: old_case_file = f.read() # Preprocess the data new_case_file = re.sub(r'[^a-zA-Z0-9\s]', ' ', new_case_file) old_case_file = re.sub(r'[^a-zA-Z0-9\s]', ' ', old_case_file) # Tokenize the data new_case_file_tokens = [token.text for token in nlp(new_case_file)] old_case_file_tokens = [token.text for token in nlp(old_case_file)] # Compute similarity sim_score = gensim.similarities.jaccard(new_case_file_tokens, old_case_file_tokens) similarities.append((filename, float(sim_score))) # Sort the similarities in descending order similarities = sorted(similarities, key=lambda x: x[1], reverse=True) # Show the most similar cases if len(similarities) < 5: messagebox.showerror(title="Error", message="There are less than 5 similar cases") else: messagebox.showinfo(title="Similar Cases", message="The most similar cases are: \n\n" + "\n".join([x[0] for x in similarities[:5]])) # Widgets label_upload = tk.Label(text="Select new case file:") label_upload.grid(column=0, row=0) entry_upload = tk.Entry(width=30, state="readonly") entry_upload.grid(column=1, row=0) button_upload = tk.Button(text="Browse", command=upload_file) button_upload.grid(column=2, row=0) label_folder = tk.Label(text="Select folder with old case files:") label_folder.grid(column=0, row=1) entry_folder = tk.Entry(width=30, state="readonly") entry_folder.grid(column=1, row=1) button_folder = tk.Button(text="Browse", command=upload_folder) button_folder.grid(column=2, row=1) button_compare = tk.Button(text="Compare", command=compare) button_compare.grid(column=3, row=1) label_save = tk.Label(text="Select a file path to save the compared file:") label_save.grid(column=0, row=2) entry_save = tk.Entry(width=30, state="readonly") entry_save.grid(column=2, row=2) button_save = tk.Button(text="Save", command=save_file) button_save.grid(column=3, row=2) text_box = tk.Text(height=10) text_box.grid(column=1, row=4) window.mainloop() But when I chick on the compare button the code gives me errors even if I changed the encoding of the files to utf-8 it still gives me errors below C:\\Users\\user\\PycharmProjects\\pythonProject1\\venv\\Scripts\\python.exe C:\\Users\\user\\PycharmProjects\\pythonProject1\\main.py Exception in Tkinter callback Traceback (most recent call last): File "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\tkinter\__init_\_.py", line 1948, in __call__ return self.func(\*args) ^^^^^^^^^^^^^^^^ File "C:\\Users\\user\\PycharmProjects\\pythonProject1\\main.py", line 61, in compare folder_contents += f.read() + "\\n" ^^^^^^^^ File "\<frozen codecs\>", line 322, in decode UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe4 in position 10: invalid continuation byte
Sign in to join this conversation.
No milestone
No project
No assignees
1 participant
Notifications
Due date
The due date is invalid or out of range. Please use the format "yyyy-mm-dd".

No due date set.

Dependencies

No dependencies set.

Reference
starred/Insomniac#854
No description provided.