# /management/commands/import_data.py

import os
import json
import requests
from blake3 import blake3
from tqdm.auto import tqdm
from PIL import Image

from datetime import datetime

from django.core.management.base import BaseCommand
from django.core.files.base import ContentFile
from django.utils.text import slugify
from django.utils import timezone
from django.core.exceptions import ObjectDoesNotExist

from django.contrib.contenttypes.models import ContentType


from apps.files.models import User_Profile_Images, User_Banner_Images, Submission_File, Metadata_Files

from apps.sites.models import Category, Submissions, Users, Tags

from apps.sites.furaffinity.models import FA_Submission, FA_Tags, FA_User, FA_Species, FA_Gender, FA_Mature
from apps.sites.twitter.models import Twitter_Submissions, Twitter_Users, Twitter_Tags

from utils.files import get_mime_type
from utils.strings import get_urls


class Command(BaseCommand):
    """Import scraped JSON metadata (plus the media file that sits next to it)
    into the site-wide archive models.

    Each ``*.json`` file describes one submission; the matching media file has
    the same path minus the ``.json`` suffix. Currently the ``twitter`` and
    ``furaffinity`` categories are implemented.
    """

    help = 'Import data from JSON files in a folder or a single JSON file to the Twitter archive'

    def add_arguments(self, parser):
        parser.add_argument('path', type=str, help='Path to the folder containing JSON files or a single JSON file')
        parser.add_argument('--delete', action='store_true', help='Delete imported files')

    def handle(self, *args, **kwargs):
        """Entry point: dispatch to single-file or recursive-folder import."""
        path = kwargs['path']
        delete = kwargs['delete']

        if os.path.isfile(path):
            self.process_json_file(path, delete)
        elif os.path.isdir(path):
            self.process_json_folder(path, delete)
        else:
            self.stdout.write(self.style.ERROR(f"The path '{path}' is not a valid file or folder."))
            return

    def process_json_file(self, file_path, delete):
        """Load one JSON metadata file and import its contents.

        :param file_path: Path to the ``*.json`` metadata file.
        :param delete: Whether to delete imported files after processing.
        """
        # tqdm.write (not stdout.write) keeps log lines from corrupting the
        # progress bar drawn by process_json_folder.
        tqdm.write(f"Importing data from: {file_path}")

        with open(file_path) as f:
            data = json.load(f)

        self.import_data(data, file_path, delete)

        tqdm.write(self.style.SUCCESS('Data imported successfully.'))

    def process_json_folder(self, folder_path, delete):
        """Recursively import every ``*.json`` file under ``folder_path``."""
        if not os.path.exists(folder_path):
            tqdm.write(self.style.ERROR(f"The folder '{folder_path}' does not exist."))
            return

        for root, dirs, files in tqdm(os.walk(folder_path), dynamic_ncols=True):
            for file_name in files:
                if file_name.endswith('.json'):
                    self.process_json_file(os.path.join(root, file_name), delete)

    def compute_file_hash(self, file_path):
        """Compute the BLAKE3 hash of a file; return None on error."""
        try:
            hasher = blake3()
            with open(file_path, 'rb') as f:
                # Hash in 64 KiB chunks so large media files don't load whole.
                while chunk := f.read(65536):
                    hasher.update(chunk)
            return hasher.hexdigest()
        except Exception as e:
            tqdm.write(self.style.WARNING(f"Error computing file hash: {e}"))
            return None

    def compute_string_hash(self, string):
        """Compute the BLAKE3 hash of a string; return None on error."""
        try:
            hasher = blake3()
            hasher.update(string.encode())
            return hasher.hexdigest()
        except Exception as e:
            tqdm.write(self.style.WARNING(f"Error computing string hash: {e}"))
            return None

    def _apply_image_dimensions(self, file_instance):
        """Populate ``image_width``/``image_height`` from the stored file, or
        clear both fields when the file is not an image.

        Bug fix: PIL's ``Image.size`` is ``(width, height)``; the original
        assigned it to ``(image_height, image_width)``, swapping the two.
        """
        if file_instance.file_mime.startswith("image/"):
            im = Image.open(file_instance.file)
            file_instance.image_width, file_instance.image_height = im.size
        else:
            file_instance.image_height = None
            file_instance.image_width = None

    def import_file(self, file_path, model, delete=False):
        """
        Import a file if it doesn't already exist in the database (keyed by
        content hash) and return the instance.

        :param file_path: The path to the file to import.
        :param model: The model class to which the file instance should be linked.
        :param delete: Whether to delete the imported file after processing.
        :return: The file instance, or None when ``file_path`` does not exist.
        """
        file_instance = None

        if os.path.exists(file_path):
            file_hash = self.compute_file_hash(file_path)

            file_name = os.path.basename(file_path)
            _, file_ext = os.path.splitext(file_name)
            # Files are stored under their content hash so duplicates collapse.
            hash_file_name = file_hash + file_ext

            try:
                # Already imported: refresh the derived metadata only.
                file_instance = model.objects.get(file_hash=file_hash)

                file_instance.file_ext = file_ext
                file_instance.size = os.path.getsize(file_path)
                file_instance.file_mime = get_mime_type(file_path)
                self._apply_image_dimensions(file_instance)
                file_instance.save()

                tqdm.write(self.style.NOTICE(f"Skipping: {file_path} file, already imported"))

            except model.DoesNotExist:
                # New file: store it and record its metadata.
                with open(file_path, 'rb') as file:
                    file_instance = model()
                    file_instance.file_hash = file_hash

                    file_instance.file.save(hash_file_name, file)

                    file_instance.file_ext = file_ext
                    file_instance.file_mime = get_mime_type(file_path)
                    file_instance.size = os.path.getsize(file_path)
                    self._apply_image_dimensions(file_instance)
                    file_instance.file_name = file_name
                    file_instance.save()

                    tqdm.write(self.style.NOTICE(f"Import file: {file_path}"))

            if delete:
                # Bug fix: pass delete=True explicitly. The original called
                # delete_imported_file(file_path) and the callee's `delete`
                # parameter defaults to False, so the file was never removed.
                self.delete_imported_file(file_path, True)

        return file_instance

    def delete_imported_file(self, file_path, delete=False):
        """
        Delete the file if the --delete flag is used.

        :param file_path: Path of the file to remove.
        :param delete: Whether to delete the imported file after processing.
        """
        if delete:
            if os.path.exists(file_path):
                os.remove(file_path)
                tqdm.write(self.style.SUCCESS(f"Deleted: {file_path}"))
            else:
                tqdm.write(self.style.WARNING(f"File not found: {file_path}"))

    def import_data(self, data, json_file_path, delete):
        """Dispatch to a site-specific importer based on ``data['category']``."""
        category = data['category']

        if category == "twitter":
            self.import_from_twitter(data, json_file_path, delete)
        elif category == "furaffinity":
            self.import_from_furaffinity(data, json_file_path, delete)
        else:
            tqdm.write(f"Skipping '{category}' not implemented")

    def import_twitter_user(self, data, file_path, category, delete=False):
        """
        Import a Twitter user from the provided data into the database.

        Parameters:
            data (dict): The data containing information about the Twitter user.
            file_path (str): The file path for importing user images.
            category (Category): Site category to link the user to.
            delete (bool): Flag indicating whether to delete user images after importing.

        Returns:
            tuple[Twitter_Users, Users]: The Twitter user row and the
            site-wide user row linked to it via a generic foreign key.
        """
        content_type = ContentType.objects.get_for_model(Twitter_Users)

        author, created = Twitter_Users.objects.get_or_create(artist_id=data['author']['id'])

        author.artist = data['author']['nick']
        author.artist_url = data['author']['name']
        author.date = timezone.make_aware(datetime.strptime(data['author']["date"], "%Y-%m-%d %H:%M:%S"))
        author.description = data['author']['description']
        if 'url' in data['author']:
            author.extra_url = data['author']['url']
        author.location = data['author']['location']
        author.verified = data['author']['verified']

        # Counters are monotone in practice: keep the maximum value seen so
        # far so re-importing an older snapshot never regresses them.
        for counter in ('favourites_count', 'followers_count', 'friends_count',
                        'media_count', 'listed_count', 'statuses_count'):
            current = getattr(author, counter)
            incoming = data['author'][counter]
            if current is None or incoming > current:
                setattr(author, counter, incoming)

        if data['subcategory'] == "avatar":
            author.profile_image = data['author']['profile_image']
            author.icon = self.import_file(file_path, User_Profile_Images, delete)
        elif data['subcategory'] == "background":
            author.profile_banner = data['author']['profile_banner']
            author.banner = self.import_file(file_path, User_Banner_Images, delete)

        # Site-wide user row, keyed by a hash of (handle + category name).
        author_hash = self.compute_string_hash(data['author']['name'] + data['category'])

        site_user, created = Users.objects.get_or_create(user_hash=author_hash)

        site_user.category = category

        # Generic-FK link from the site-wide user to the Twitter-specific row.
        site_user.content_type = content_type
        site_user.object_id = author.pk

        site_user.save()
        author.save()

        return author, site_user

    def import_twitter_tags(self, data: dict, category) -> list[Twitter_Tags]:
        """
        Import the hashtags of a tweet into the database.

        Parameters:
            data (dict): The data containing information about the tweet.
            category (Category): Site category to add to each site-wide tag.

        Returns:
            list[Twitter_Tags]: A list of imported or retrieved Twitter tag objects.
        """
        content_type = ContentType.objects.get_for_model(Twitter_Tags)

        tags: list[Twitter_Tags] = []

        if "hashtags" in data:
            for t_tag_name in data["hashtags"]:
                t_tag_slug = slugify(t_tag_name)
                try:
                    # Reuse an existing tag when the slug is already known.
                    tag: Twitter_Tags = Twitter_Tags.objects.get(tag_slug=t_tag_slug)
                except ObjectDoesNotExist:
                    tag = Twitter_Tags(tag=t_tag_name)
                    tag.tag_slug = t_tag_slug
                    # Bug fix: save before reading .pk — an unsaved instance
                    # has pk None, so the generic link below pointed nowhere.
                    tag.save()

                site_tags, created = Tags.objects.get_or_create(tag_slug=t_tag_slug)

                site_tags.category.add(category)

                site_tags.content_type = content_type
                site_tags.object_id = tag.pk

                site_tags.save()

                tags.append(tag)

        return tags

    def import_from_twitter(self, data, json_file_path, delete):
        """Import one tweet (metadata + media) into the Twitter and site-wide models."""
        category, created = Category.objects.get_or_create(name=data['category'])
        category.save()

        twitter_submission, created = Twitter_Submissions.objects.get_or_create(submission_id=data["tweet_id"])

        # The media file sits next to the JSON: "<media>.<ext>.json" -> "<media>.<ext>"
        file_path = json_file_path.removesuffix(".json")

        # Handle author import
        author, site_user = self.import_twitter_user(data, file_path, category, delete)
        twitter_submission.author = author

        # Handle tag import
        for tag in self.import_twitter_tags(data, category):
            twitter_submission.tags.add(tag)

        twitter_submission.gallery_type = data['subcategory']

        # Handle media and metadata file imports
        twitter_submission.files.add(self.import_file(file_path, Submission_File, delete))
        twitter_submission.metadata.add(self.import_file(json_file_path, Metadata_Files, delete))

        twitter_submission.description = data['content']
        twitter_submission.date = timezone.make_aware(datetime.strptime(data['date'], "%Y-%m-%d %H:%M:%S"))
        twitter_submission.origin_site = data['category']
        twitter_submission.file_extension = data['extension']
        twitter_submission.origin_filename = data['filename']

        # Track the highest media index seen for multi-image tweets.
        if twitter_submission.media_num is None or data['num'] > twitter_submission.media_num:
            twitter_submission.media_num = data['num']

        # Optional fields: only present in some metadata versions.
        if "height" in data:
            twitter_submission.image_height = data['height']
        if "width" in data:
            twitter_submission.image_width = data['width']
        if "sensitive" in data:
            twitter_submission.sensitive = data['sensitive']
        if "favorite_count" in data:
            twitter_submission.favorites_count = data['favorite_count']
        if "quote_count" in data:
            twitter_submission.quote_count = data['quote_count']
        if "reply_count" in data:
            twitter_submission.reply_count = data['reply_count']
        if "retweet_count" in data:
            twitter_submission.retweet_count = data['retweet_count']

        twitter_submission.lang = data['lang']
        twitter_submission.save()

        # Site-wide submission row, keyed by hash of (category + author + id).
        submission_hash = self.compute_string_hash(category.name + data['author']['name'] + str(data["tweet_id"]))

        submission, created = Submissions.objects.get_or_create(submission_hash=submission_hash)

        submission.category = category
        submission.author = site_user

        # Missing 'sensitive' flag is treated as not mature.
        if twitter_submission.sensitive is not None:
            submission.mature = twitter_submission.sensitive
        else:
            submission.mature = False

        submission.date = timezone.make_aware(datetime.strptime(data['date'], "%Y-%m-%d %H:%M:%S"))

        # Generic-FK link from the site-wide submission to the Twitter row.
        submission.content_type = ContentType.objects.get_for_model(Twitter_Submissions)
        submission.object_id = twitter_submission.pk

        submission.save()

        # NOTE: when --delete is set, import_file has already removed these;
        # the calls below then log a benign "File not found" warning.
        self.delete_imported_file(json_file_path, delete)
        self.delete_imported_file(file_path, delete)

    def import_furaffinity_user(self, data, json_file_path, category, delete):
        """Import a FurAffinity artist and its linked site-wide user row.

        Returns:
            tuple[FA_User, Users]: The FA artist row and the site-wide user row.
        """
        content_type = ContentType.objects.get_for_model(FA_User)

        artist, created = FA_User.objects.get_or_create(artist_url=data["artist_url"], artist=data["artist"])

        author_hash = self.compute_string_hash(data["artist_url"] + data['category'])

        site_user, created = Users.objects.get_or_create(user_hash=author_hash)

        site_user.category = category

        # Generic-FK link from the site-wide user to the FA-specific row.
        site_user.content_type = content_type
        site_user.object_id = artist.pk

        site_user.save()

        return artist, site_user

    def import_furaffinity_tags(self, data, category):
        """Import a submission's FA tags plus their site-wide counterparts.

        Returns:
            tuple[list[FA_Tags], list[Tags]]: FA tag rows and site-wide tag rows.
        """
        content_type = ContentType.objects.get_for_model(FA_Tags)

        tags: list[FA_Tags] = []
        site_tags: list[Tags] = []

        if "tags" in data:
            for t_tag_name in data["tags"]:
                t_tag_slug = slugify(t_tag_name)
                try:
                    # Reuse an existing tag when the slug is already known.
                    tag: FA_Tags = FA_Tags.objects.get(tag_slug=t_tag_slug)
                except ObjectDoesNotExist:
                    tag = FA_Tags(tag=t_tag_name)
                    tag.tag_slug = t_tag_slug
                    # Bug fix: save before reading .pk — an unsaved instance
                    # has pk None, so the generic link below pointed nowhere.
                    tag.save()

                site_tag, created = Tags.objects.get_or_create(tag_slug=t_tag_slug)

                site_tag.category.add(category)

                site_tag.content_type = content_type
                site_tag.object_id = tag.pk

                site_tag.save()

                tags.append(tag)
                site_tags.append(site_tag)

        return tags, site_tags

    def import_from_furaffinity(self, data, json_file_path, delete):
        """Import one FA submission (metadata + media) into the FA and site-wide models."""
        category, created = Category.objects.get_or_create(name=data['category'])
        category.save()

        furaffinity_submission, created = FA_Submission.objects.get_or_create(submission_id=data["id"])

        furaffinity_submission.media_url = data["url"]
        furaffinity_submission.title = data["title"]
        furaffinity_submission.description = data["description"]
        furaffinity_submission.date = timezone.make_aware(datetime.strptime(data["date"], "%Y-%m-%d %H:%M:%S"))

        # The media file sits next to the JSON: "<media>.<ext>.json" -> "<media>.<ext>"
        file_path = json_file_path.removesuffix(".json")

        # Handle author import
        author, site_user = self.import_furaffinity_user(data, file_path, category, delete)
        furaffinity_submission.artist = author

        # Handle tag import
        tags, site_tags = self.import_furaffinity_tags(data, category)
        for tag in tags:
            furaffinity_submission.tags.add(tag)

        species, created = FA_Species.objects.get_or_create(species=data["species"])
        furaffinity_submission.species = species

        # Handle mature rating import
        mature, created = FA_Mature.objects.get_or_create(mature=data["rating"])
        furaffinity_submission.mature_rating = mature

        furaffinity_submission.number_of_comments = data["comments"]
        furaffinity_submission.views = data["views"]

        gender, created = FA_Gender.objects.get_or_create(gender=data["gender"])
        furaffinity_submission.gender = gender

        furaffinity_submission.fa_theme = data["theme"]
        furaffinity_submission.fa_category = data["fa_category"]
        furaffinity_submission.gallery_type = data["subcategory"]
        furaffinity_submission.file_extension = data["extension"]
        furaffinity_submission.image_height = data["height"]
        furaffinity_submission.image_width = data["width"]

        # Handle media and metadata file imports
        furaffinity_submission.files.add(self.import_file(file_path, Submission_File, delete))
        furaffinity_submission.metadata.add(self.import_file(json_file_path, Metadata_Files, delete))

        furaffinity_submission.save()

        # Site-wide submission row, keyed by hash of (category + artist + id).
        submission_hash = self.compute_string_hash(category.name + data["artist_url"] + str(data["id"]))

        submission, created = Submissions.objects.get_or_create(submission_hash=submission_hash)

        submission.category = category
        submission.tags.add(*site_tags)
        submission.author = site_user

        # Bug fix: the original condition was `!= "General" and not None`,
        # where `not None` is always True, so a missing rating counted as
        # mature. Anything other than an explicit "General" rating is mature.
        rating = furaffinity_submission.mature_rating.mature
        submission.mature = rating is not None and rating != "General"

        submission.date = timezone.make_aware(datetime.strptime(data['date'], "%Y-%m-%d %H:%M:%S"))

        # Generic-FK link from the site-wide submission to the FA row.
        submission.content_type = ContentType.objects.get_for_model(FA_Submission)
        submission.object_id = furaffinity_submission.pk

        submission.save()

        # NOTE: when --delete is set, import_file has already removed these;
        # the calls below then log a benign "File not found" warning.
        self.delete_imported_file(json_file_path, delete)
        self.delete_imported_file(file_path, delete)