From 39d7f64ccf064dbbe7d744a499c10d884622e0ed Mon Sep 17 00:00:00 2001
From: Aroy-Art
Date: Tue, 4 Jun 2024 21:00:44 +0200
Subject: [PATCH] Add: function to convert html to text

---
 archivist/utils/strings.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/archivist/utils/strings.py b/archivist/utils/strings.py
index 027b881..7e861e2 100644
--- a/archivist/utils/strings.py
+++ b/archivist/utils/strings.py
@@ -51,3 +51,25 @@ def is_string_html(string: str) -> bool:
     '''
 
     return bool(BeautifulSoup(string, "html.parser").find())
+
+
+def html_to_text(string: str) -> str:
+    """
+    Convert an HTML string to plain text.
+
+    The input is parsed once; if it contains at least one HTML tag, the
+    text content is returned, otherwise the input is returned unchanged.
+
+    Parameters:
+        string (str): The string to convert; it may or may not contain HTML.
+
+    Returns:
+        str: The text with all tags removed, or the original string when no
+            HTML tag is found.
+    """
+    # Parse once and reuse the tree for both the tag check (the same test
+    # is_string_html performs) and the text extraction.
+    soup = BeautifulSoup(string, "html.parser")
+    if soup.find():
+        return soup.get_text()
+    return string