Gallery-Archivist/archivist/utils/strings.py


import math, re

from bs4 import BeautifulSoup

# Compiled once at import time instead of on every call. Pattern layout:
#   alternative 1: scheme (3-9 letters, ":" and an optional "//"), an optional
#                  "userinfo@", then a host of letters, digits, "." and "-"
#   alternative 2: a literal "www." prefix or "userinfo@" followed by a host
#   optional tail: "/path", "?" + query characters, "#" + fragment characters
_URL_REGEX = re.compile(
    r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+|(?:www\.|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w_]*)?\??(?:[-\+=&;%@.\w_]*)#?(?:[.\!\/\w]*))?)',
    re.IGNORECASE)


def get_urls(string: str = None) -> list:
    """
    Extracts URLs from a given string using regular expressions.

    Matches are returned in the order they appear in the input. Because the
    pattern also accepts a "userinfo@host" form, e-mail-like tokens are
    returned as well.

    Parameters:
    string (str): The input string from which URLs need to be extracted.
        ``None`` (the default) and the empty string yield an empty list.

    Returns:
    list: A list of URLs extracted from the input string.
    """
    # The signature advertises None as the default, so it must not crash.
    if not string:
        return []

    # findall() returns one tuple per match because the pattern has several
    # capture groups; the outermost group (index 0) is the complete URL.
    return [groups[0] for groups in _URL_REGEX.findall(string)]


def convert_size(size_bytes: int) -> str:
    """
    A function that converts the given size in bytes to a human-readable format.

    The value is scaled by powers of 1024 and rounded to two decimals. Sizes
    beyond the largest known unit are expressed in that unit (YiB) instead of
    failing.

    Parameters:
        size_bytes (int): An integer representing the size in bytes to be converted.

    Returns:
        A string representing the converted size with the appropriate unit (B, KiB, MiB, GiB, etc.).

    Raises:
        ValueError: If size_bytes is negative.
    """
    if size_bytes == 0:
        # Historical output for zero: no decimal part and no space.
        return "0B"
    if size_bytes < 0:
        raise ValueError("size_bytes must be non-negative")

    size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")

    # Pick the unit by exact comparison against integer powers of 1024 rather
    # than floor(log(...)), which depends on float rounding at exact powers
    # and yields an out-of-range (or negative) index for very large (or
    # sub-1) values.
    i = 0
    next_unit = 1024
    while size_bytes >= next_unit and i < len(size_name) - 1:
        next_unit *= 1024
        i += 1

    s = round(size_bytes / (1024 ** i), 2)
    return f"{s} {size_name[i]}"


def is_string_html(string: str) -> bool:
    '''
    Tell whether a string contains HTML markup.

    The string is parsed with BeautifulSoup's "html.parser" backend and the
    result of an argument-less find() on the parsed document is checked.

    Parameters:
        string (str): The string to be checked

    Returns:
        bool: True if find() returned a truthy result, False otherwise
    '''
    document = BeautifulSoup(string, "html.parser")
    first_match = document.find()
    return bool(first_match)


def html_to_text(string: str) -> str:
    """
    Converts an HTML string to plain text by removing all HTML tags and returning the resulting text.

    Input in which the parser finds nothing is returned unchanged.

    Parameters:
        string (str): The HTML string to be converted.

    Returns:
        str: The plain text version of the input HTML string.
    """
    # Parse once and reuse the tree for both the "is this HTML?" check and
    # the text extraction; the check is the same one is_string_html() performs.
    soup = BeautifulSoup(string, "html.parser")
    if not soup.find():
        return string
    return soup.get_text()
Add: util func to get all urls in a string 2024-03-14 19:46:19 +01:00
Add: utli func for converting bytes to human readable formats 2024-04-29 21:03:15 +02:00			`import math, re`
Add: util func to get all urls in a string 2024-03-14 19:46:19 +01:00
Add: util fuction to check if string is html 2024-06-04 20:36:30 +02:00			`from bs4 import BeautifulSoup`

Update: Docs string to include parameters & return 2024-03-18 12:40:43 +01:00			`def get_urls(string:str = None) -> list:`
			`"""`
			`Extracts URLs from a given string using regular expressions.`

			`Parameters:`
			`string (str): The input string from which URLs need to be extracted.`

			`Returns:`
			`list: A list of URLs extracted from the input string.`
			`"""`
Add: util func to get all urls in a string 2024-03-14 19:46:19 +01:00			`regex = re.compile(`
			`r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[-;:&=\+\$,\w]+@)?[A-Za-z0-9.-]+\|(?:www.\|[-;:&=\+\$,\w]+@)[A-Za-z0-9.-]+)((?:\/[\+~%\/.\w_])?\??(?:[-\+=&;%@.\w_])#?(?:[.\!\/\\w]*))?)',`
			`re.IGNORECASE)`

			`return [x[0] for x in re.findall(regex, string)]`


Add: utli func for converting bytes to human readable formats 2024-04-29 21:03:15 +02:00			`def convert_size(size_bytes: int) -> str:`
Fix: indenting 2024-10-20 10:40:09 +02:00			`"""`
			`A function that converts the given size in bytes to a human-readable format.`

			`Parameters:`
			`size_bytes (int): An integer representing the size in bytes to be converted.`

			`Returns:`
			`A string representing the converted size with the appropriate unit (B, KiB, MiB, GiB, etc.).`
			`"""`
			`if size_bytes == 0:`
			`return "0B"`
			`size_name = ("B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB")`
			`i = int(math.floor(math.log(size_bytes, 1024)))`
			`p = math.pow(1024, i)`
			`s = round(size_bytes / p, 2)`
			`return f"{s} {size_name[i]}"`
Add: util fuction to check if string is html 2024-06-04 20:36:30 +02:00

			`def is_string_html(string: str) -> bool:`
			`'''`
			`Check if string is HTML`

			`Parameters:`
			`string (str): The string to be checked`

			`Returns:`
			`bool: True if string is HTML, False otherwise`
			`'''`
			`return bool(BeautifulSoup(string, "html.parser").find())`

Add: fuction to convert html to text 2024-06-04 21:00:44 +02:00
			`def html_to_text(string):`
			`"""`
			`Converts an HTML string to plain text by removing all HTML tags and returning the resulting text.`

			`Parameters:`
			`string (str): The HTML string to be converted.`

			`Returns:`
			`str: The plain text version of the input HTML string.`
			`"""`
			`if is_string_html(string):`
			`soup = BeautifulSoup(string, "html.parser")`
			`return soup.get_text()`
			`else:`
			`return string`