From ef1361ec39f30caff425bb91319182c887b073f8 Mon Sep 17 00:00:00 2001 From: Aroy-Art Date: Tue, 4 Jun 2024 20:36:30 +0200 Subject: [PATCH 1/2] Add: util function to check if string is html --- archivist/utils/strings.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/archivist/utils/strings.py b/archivist/utils/strings.py index 408e7af..027b881 100644 --- a/archivist/utils/strings.py +++ b/archivist/utils/strings.py @@ -1,6 +1,8 @@ import math, re +from bs4 import BeautifulSoup + def get_urls(string:str = None) -> list: """ Extracts URLs from a given string using regular expressions. @@ -35,3 +37,17 @@ def convert_size(size_bytes: int) -> str: p = math.pow(1024, i) s = round(size_bytes / p, 2) return f"{s} {size_name[i]}" + + +def is_string_html(string: str) -> bool: + ''' + Check if string is HTML + + Parameters: + string (str): The string to be checked + + Returns: + bool: True if string is HTML, False otherwise + ''' + return bool(BeautifulSoup(string, "html.parser").find()) + From 39d7f64ccf064dbbe7d744a499c10d884622e0ed Mon Sep 17 00:00:00 2001 From: Aroy-Art Date: Tue, 4 Jun 2024 21:00:44 +0200 Subject: [PATCH 2/2] Add: function to convert html to text --- archivist/utils/strings.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/archivist/utils/strings.py b/archivist/utils/strings.py index 027b881..7e861e2 100644 --- a/archivist/utils/strings.py +++ b/archivist/utils/strings.py @@ -51,3 +51,19 @@ def is_string_html(string: str) -> bool: ''' return bool(BeautifulSoup(string, "html.parser").find()) + +def html_to_text(string): + """ + Converts an HTML string to plain text by removing all HTML tags and returning the resulting text. + + Parameters: + string (str): The HTML string to be converted. + + Returns: + str: The plain text version of the input HTML string. + """ + if is_string_html(string): + soup = BeautifulSoup(string, "html.parser") + return soup.get_text() + else: + return string