def html_to_text(string: str) -> str:
    """
    Converts an HTML string to plain text by removing all HTML tags and returning the resulting text.

    The input is parsed once with the "html.parser" backend. If the parsed
    document contains at least one tag, the text of the document is
    returned; otherwise the input is returned exactly as it was given.

    Parameters:
        string (str): The HTML string to be converted.

    Returns:
        str: The plain text version of the input HTML string, or the
            unmodified input when it contains no HTML tags.
    """
    # Parse a single time and reuse the tree for both the "is this HTML?"
    # check and the text extraction, instead of parsing the input twice.
    soup = BeautifulSoup(string, "html.parser")

    # find() with no arguments returns the first tag in the document, or
    # None when there is none -- the same test is_string_html() performs.
    if soup.find() is None:
        # No markup found: hand the caller back the original string untouched.
        return string

    return soup.get_text()