from bs4 import BeautifulSoup


def is_string_html(string: str) -> bool:
    """
    Check if a string contains HTML markup.

    The string is parsed with BeautifulSoup's "html.parser" backend and is
    considered HTML when the parsed document contains at least one element.

    Parameters:
        string (str): The string to be checked.

    Returns:
        bool: True if string is HTML, False otherwise (including when the
            input is empty or None).
    """
    # Nothing to parse: an empty/None input cannot contain markup.
    if not string:
        return False
    # find() with no filters yields the first element of the document, or
    # None when the input holds no tags at all.
    return bool(BeautifulSoup(string, "html.parser").find())


def html_to_text(string: str) -> str:
    """
    Converts an HTML string to plain text by removing all HTML tags and
    returning the resulting text.

    Input that contains no HTML elements is returned unchanged.

    Parameters:
        string (str): The HTML string to be converted.

    Returns:
        str: The plain text version of the input HTML string, or the input
            itself when it is empty/None or contains no HTML elements.
    """
    # Empty/None input is passed straight through instead of being parsed.
    if not string:
        return string

    # Parse once and reuse the document for both the "is this HTML?" check
    # and the text extraction (same test as is_string_html).
    soup = BeautifulSoup(string, "html.parser")
    if not soup.find():
        return string
    return soup.get_text()