# archivist/apps/sites/management/commands/import_data.py
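"""
Management command for importing downloaded gallery metadata into the archive.

Walks a folder (or takes a single file) of JSON metadata files and imports the
submissions, users, tags, and media files they describe into the Twitter and
FurAffinity archive models.

Usage (standard Django management command invocation):
    python manage.py import_data /path/to/metadata [--delete]
"""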
import os
import json
import requests
from blake3 import blake3
from tqdm.auto import tqdm
from PIL import Image
from datetime import datetime
from django.core.management.base import BaseCommand
from django.core.files.base import ContentFile
from django.utils.text import slugify
from django.utils import timezone
from django.core.exceptions import ObjectDoesNotExist
from django.contrib.contenttypes.models import ContentType
from apps.files.models import User_Profile_Images, User_Banner_Images, Submission_File, Metadata_Files
from apps.sites.models import Category, Submissions, Users, Tags
from apps.sites.furaffinity.models import FA_Submission, FA_Tags, FA_User, FA_Species, FA_Gender, FA_Mature
from apps.sites.twitter.models import Twitter_Submissions, Twitter_Users, Twitter_Tags
from utils.files import get_mime_type
from utils.strings import get_urls
class Command(BaseCommand):
    help = 'Import data from JSON files in a folder or a single JSON file into the archive'

    def add_arguments(self, parser):
        parser.add_argument('path', type=str, help='Path to the folder containing JSON files or a single JSON file')
        parser.add_argument('--delete', action='store_true', help='Delete imported files')

    def handle(self, *args, **kwargs):
        path = kwargs['path']
        delete = kwargs['delete']
        if os.path.isfile(path):
            self.process_json_file(path, delete)
        elif os.path.isdir(path):
            self.process_json_folder(path, delete)
        else:
            self.stdout.write(self.style.ERROR(f"The path '{path}' is not a valid file or folder."))
            return
    def process_json_file(self, file_path, delete):
        tqdm.write(f"Importing data from: {file_path}")
        with open(file_path) as f:
            data = json.load(f)
        self.import_data(data, file_path, delete)
        tqdm.write(self.style.SUCCESS('Data imported successfully.'))

    def process_json_folder(self, folder_path, delete):
        if not os.path.exists(folder_path):
            tqdm.write(self.style.ERROR(f"The folder '{folder_path}' does not exist."))
            return
        for root, dirs, files in tqdm(os.walk(folder_path), dynamic_ncols=True):
            for file_name in files:
                if file_name.endswith('.json'):
                    file_path = os.path.join(root, file_name)
                    self.process_json_file(file_path, delete)
    def compute_file_hash(self, file_path):
        """ Compute BLAKE3 hash of the file """
        try:
            hasher = blake3()
            with open(file_path, 'rb') as f:
                while chunk := f.read(65536):
                    hasher.update(chunk)
            return hasher.hexdigest()
        except Exception as e:
            tqdm.write(self.style.WARNING(f"Error computing file hash: {e}"))
            return None

    def compute_string_hash(self, string):
        """ Compute BLAKE3 hash of the string """
        try:
            hasher = blake3()
            hasher.update(string.encode())
            return hasher.hexdigest()
        except Exception as e:
            tqdm.write(self.style.WARNING(f"Error computing string hash: {e}"))
            return None
    def import_file(self, file_path, model, delete=False):
        """
        Imports a file if it doesn't already exist in the database and returns the instance.
        :param file_path: The path to the file to import.
        :param model: The model class to which the file instance should be linked.
        :param delete: Whether to delete the imported file after processing.
        :return: The file instance, or None if the file does not exist.
        """
        file_instance = None  # Initialize file_instance to None
        if os.path.exists(file_path):
            file_hash = self.compute_file_hash(file_path)
            file_name = os.path.basename(file_path)
            _, file_ext = os.path.splitext(file_name)
            hash_file_name = file_hash + file_ext
            try:
                # File already imported: refresh its metadata instead of re-saving the file
                file_instance = model.objects.get(file_hash=file_hash)
                file_instance.file_ext = file_ext
                file_instance.size = os.path.getsize(file_path)
                file_instance.file_mime = get_mime_type(file_path)
                if file_instance.file_mime.startswith("image/"):
                    im = Image.open(file_instance.file)
                    # PIL's Image.size is (width, height)
                    file_instance.image_width, file_instance.image_height = im.size
                else:
                    file_instance.image_height = None
                    file_instance.image_width = None
                file_instance.save()
                tqdm.write(self.style.NOTICE(f"Skipping: {file_path}, already imported"))
            except model.DoesNotExist:
                # If the file doesn't exist, create a new file instance
                with open(file_path, 'rb') as file:
                    file_instance = model()
                    file_instance.file_hash = file_hash
                    file_instance.file.save(hash_file_name, file)
                    file_instance.file_ext = file_ext
                    file_instance.file_mime = get_mime_type(file_path)
                    file_instance.size = os.path.getsize(file_path)
                    if file_instance.file_mime.startswith("image/"):
                        im = Image.open(file_instance.file)
                        # PIL's Image.size is (width, height)
                        file_instance.image_width, file_instance.image_height = im.size
                    else:
                        file_instance.image_height = None
                        file_instance.image_width = None
                    file_instance.file_name = file_name
                    file_instance.save()
                tqdm.write(self.style.NOTICE(f"Import file: {file_path}"))
            if delete:
                # Pass the flag through so the helper's own check actually fires
                self.delete_imported_file(file_path, delete)
        return file_instance
    def delete_imported_file(self, file_path, delete=False):
        """
        Delete the file if the --delete flag is used.
        :param file_path: The path to the file to delete.
        :param delete: Whether to delete the imported file after processing.
        """
        if delete:
            if os.path.exists(file_path):
                os.remove(file_path)
                tqdm.write(self.style.SUCCESS(f"Deleted: {file_path}"))
            else:
                tqdm.write(self.style.WARNING(f"File not found: {file_path}"))

    def import_data(self, data, json_file_path, delete):
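        """
        Dispatch a parsed metadata dict to the importer for its site category.

        Parameters:
            data (dict): Parsed JSON metadata for a single submission.
            json_file_path (str): Path to the JSON file the data came from.
            delete (bool): Whether to delete imported files after processing.
        """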
        category = data['category']
        if category == "twitter":
            self.import_from_twitter(data, json_file_path, delete)
        elif category == "furaffinity":
            self.import_from_furaffinity(data, json_file_path, delete)
        else:
            tqdm.write(f"Skipping '{category}': not implemented")
    def import_twitter_user(self, data, file_path, category, delete=False):
        """
        Import a Twitter user from the provided data into the database.

        Parameters:
            data (dict): The data containing information about the Twitter user.
            file_path (str): The file path for importing user images.
            category (Category): The site category the user belongs to.
            delete (bool): Whether to delete user images after importing them.

        Returns:
            (Twitter_Users, Users): The Twitter user and the linked site-wide user.
        """
        content_type = ContentType.objects.get_for_model(Twitter_Users)
        author, created = Twitter_Users.objects.get_or_create(artist_id=data['author']['id'])
        author.artist = data['author']['nick']
        author.artist_url = data['author']['name']
        author.date = timezone.make_aware(datetime.strptime(data['author']["date"], "%Y-%m-%d %H:%M:%S"))
        author.description = data['author']['description']
        if 'url' in data['author']:
            author.extra_url = data['author']['url']
        author.location = data['author']['location']
        author.verified = data['author']['verified']
        # Only keep the highest counts seen across imports
        if author.favourites_count is None or data['author']["favourites_count"] > author.favourites_count:
            author.favourites_count = data['author']["favourites_count"]
        if author.followers_count is None or data['author']["followers_count"] > author.followers_count:
            author.followers_count = data['author']["followers_count"]
        if author.friends_count is None or data['author']["friends_count"] > author.friends_count:
            author.friends_count = data['author']["friends_count"]
        if author.media_count is None or data['author']["media_count"] > author.media_count:
            author.media_count = data['author']["media_count"]
        if author.listed_count is None or data['author']["listed_count"] > author.listed_count:
            author.listed_count = data['author']["listed_count"]
        if author.statuses_count is None or data['author']["statuses_count"] > author.statuses_count:
            author.statuses_count = data['author']["statuses_count"]
        if data['subcategory'] == "avatar":
            author.profile_image = data['author']['profile_image']
            author.icon = self.import_file(file_path, User_Profile_Images, delete)
        elif data['subcategory'] == "background":
            author.profile_banner = data['author']['profile_banner']
            author.banner = self.import_file(file_path, User_Banner_Images, delete)
        author_hash = self.compute_string_hash(data['author']['name'] + data['category'])
        site_user, created = Users.objects.get_or_create(user_hash=author_hash)
        site_user.category = category
        # Link the site-wide user to the Twitter user via the generic relation
        site_user.content_type = content_type
        site_user.object_id = author.pk
        site_user.save()
        author.save()
        return author, site_user
    def import_twitter_tags(self, data: dict, category: Category) -> list[Twitter_Tags]:
        """
        Import the Twitter hashtags from the provided data into the database.

        Parameters:
            data (dict): The data containing information about the Twitter submission.
            category (Category): The site category the tags belong to.

        Returns:
            list[Twitter_Tags]: A list of imported or retrieved Twitter tag objects.
        """
        content_type = ContentType.objects.get_for_model(Twitter_Tags)
        tags: list[Twitter_Tags] = []
        if "hashtags" in data:
            for t_tag_name in data["hashtags"]:
                t_tag_slug = slugify(t_tag_name)
                try:
                    # Check if the tag already exists in the database by slug
                    tag: Twitter_Tags = Twitter_Tags.objects.get(tag_slug=t_tag_slug)
                except ObjectDoesNotExist:
                    # If the tag does not exist, create it (save before reading its primary key)
                    tag = Twitter_Tags(tag=t_tag_name)
                    tag.tag_slug = t_tag_slug
                    tag.save()
                site_tags, created = Tags.objects.get_or_create(tag_slug=t_tag_slug)
                site_tags.category.add(category)
                site_tags.content_type = content_type
                site_tags.object_id = tag.pk
                site_tags.save()
                tags.append(tag)
        return tags
    def import_from_twitter(self, data, json_file_path, delete):
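        """
        Import a single Twitter submission (and its author, tags, and files)
        from the parsed metadata and link it to a site-wide Submissions record.

        Parameters:
            data (dict): Parsed JSON metadata for the tweet.
            json_file_path (str): Path to the JSON metadata file.
            delete (bool): Whether to delete imported files after processing.
        """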
        category, created = Category.objects.get_or_create(name=data['category'])
        twitter_submission, created = Twitter_Submissions.objects.get_or_create(submission_id=data["tweet_id"])
        # The media file sits next to the JSON metadata, minus the ".json" suffix
        file_path = json_file_path.removesuffix(".json")
        # Handle author import
        author, site_user = self.import_twitter_user(data, file_path, category, delete)
        twitter_submission.author = author
        # Handle tag import
        tags = self.import_twitter_tags(data, category)
        for tag in tags:
            twitter_submission.tags.add(tag)  # Add the tag to the submission
        twitter_submission.gallery_type = data['subcategory']
        # Handle file import
        twitter_submission.files.add(self.import_file(file_path, Submission_File, delete))
        # Handle metadata file import
        twitter_submission.metadata.add(self.import_file(json_file_path, Metadata_Files, delete))
        twitter_submission.description = data['content']
        twitter_submission.date = timezone.make_aware(datetime.strptime(data['date'], "%Y-%m-%d %H:%M:%S"))
        twitter_submission.origin_site = data['category']
        twitter_submission.file_extension = data['extension']
        twitter_submission.origin_filename = data['filename']
        if twitter_submission.media_num is None or data['num'] > twitter_submission.media_num:
            twitter_submission.media_num = data['num']
        if "height" in data:
            twitter_submission.image_height = data['height']
        if "width" in data:
            twitter_submission.image_width = data['width']
        if "sensitive" in data:
            twitter_submission.sensitive = data['sensitive']
        if "favorite_count" in data:
            twitter_submission.favorites_count = data['favorite_count']
        if "quote_count" in data:
            twitter_submission.quote_count = data['quote_count']
        if "reply_count" in data:
            twitter_submission.reply_count = data['reply_count']
        if "retweet_count" in data:
            twitter_submission.retweet_count = data['retweet_count']
        twitter_submission.lang = data['lang']
        twitter_submission.save()
        submission_hash = self.compute_string_hash(category.name + data['author']['name'] + str(data["tweet_id"]))
        submission, created = Submissions.objects.get_or_create(submission_hash=submission_hash)
        submission.category = category
        submission.author = site_user
        if twitter_submission.sensitive is not None:
            submission.mature = twitter_submission.sensitive
        else:
            submission.mature = False
        submission.date = timezone.make_aware(datetime.strptime(data['date'], "%Y-%m-%d %H:%M:%S"))
        # Link the site-wide submission to the Twitter submission via the generic relation
        submission.content_type = ContentType.objects.get_for_model(Twitter_Submissions)
        submission.object_id = twitter_submission.pk
        submission.save()
        self.delete_imported_file(json_file_path, delete)
        self.delete_imported_file(file_path, delete)
    def import_furaffinity_user(self, data, json_file_path, category, delete):
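        """
        Import a FurAffinity user from the provided data into the database.

        Parameters:
            data (dict): Parsed JSON metadata containing "artist" and "artist_url".
            json_file_path (str): Path to the related file (currently unused here).
            category (Category): The site category the user belongs to.
            delete (bool): Whether to delete imported files (currently unused here).

        Returns:
            (FA_User, Users): The FurAffinity user and the linked site-wide user.
        """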
        content_type = ContentType.objects.get_for_model(FA_User)
        artist, created = FA_User.objects.get_or_create(artist_url=data["artist_url"], artist=data["artist"])
        author_hash = self.compute_string_hash(data["artist_url"] + data['category'])
        site_user, created = Users.objects.get_or_create(user_hash=author_hash)
        site_user.category = category
        # Link the site-wide user to the FurAffinity user via the generic relation
        site_user.content_type = content_type
        site_user.object_id = artist.pk
        site_user.save()
        return artist, site_user
    def import_furaffinity_tags(self, data, category):
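        """
        Import the FurAffinity tags from the provided data into the database.

        Parameters:
            data (dict): Parsed JSON metadata containing a "tags" list.
            category (Category): The site category the tags belong to.

        Returns:
            (list[FA_Tags], list[Tags]): The FurAffinity tags and the linked site-wide tags.
        """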
        content_type = ContentType.objects.get_for_model(FA_Tags)
        tags: list[FA_Tags] = []
        site_tags: list[Tags] = []
        if "tags" in data:
            for t_tag_name in data["tags"]:
                t_tag_slug = slugify(t_tag_name)
                try:
                    # Check if the tag already exists in the database by slug
                    tag: FA_Tags = FA_Tags.objects.get(tag_slug=t_tag_slug)
                except ObjectDoesNotExist:
                    # If the tag does not exist, create it (save before reading its primary key)
                    tag = FA_Tags(tag=t_tag_name)
                    tag.tag_slug = t_tag_slug
                    tag.save()
                site_tag, created = Tags.objects.get_or_create(tag_slug=t_tag_slug)
                site_tag.category.add(category)
                site_tag.content_type = content_type
                site_tag.object_id = tag.pk
                site_tag.save()
                tags.append(tag)
                site_tags.append(site_tag)
        return tags, site_tags
    def import_from_furaffinity(self, data, json_file_path, delete):
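        """
        Import a single FurAffinity submission (and its artist, tags, and files)
        from the parsed metadata and link it to a site-wide Submissions record.

        Parameters:
            data (dict): Parsed JSON metadata for the submission.
            json_file_path (str): Path to the JSON metadata file.
            delete (bool): Whether to delete imported files after processing.
        """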
        category, created = Category.objects.get_or_create(name=data['category'])
        furaffinity_submission, created = FA_Submission.objects.get_or_create(submission_id=data["id"])
        furaffinity_submission.media_url = data["url"]
        furaffinity_submission.title = data["title"]
        furaffinity_submission.description = data["description"]
        furaffinity_submission.date = timezone.make_aware(datetime.strptime(data["date"], "%Y-%m-%d %H:%M:%S"))
        # The media file sits next to the JSON metadata, minus the ".json" suffix
        file_path = json_file_path.removesuffix(".json")
        # Handle author import
        author, site_user = self.import_furaffinity_user(data, file_path, category, delete)
        furaffinity_submission.artist = author
        # Handle tag import
        tags, site_tags = self.import_furaffinity_tags(data, category)
        for tag in tags:
            furaffinity_submission.tags.add(tag)  # Add the tag to the submission
        species, created = FA_Species.objects.get_or_create(species=data["species"])
        furaffinity_submission.species = species
        # Handle mature rating import
        mature, created = FA_Mature.objects.get_or_create(mature=data["rating"])
        furaffinity_submission.mature_rating = mature
        furaffinity_submission.number_of_comments = data["comments"]
        furaffinity_submission.views = data["views"]
        gender, created = FA_Gender.objects.get_or_create(gender=data["gender"])
        furaffinity_submission.gender = gender
        furaffinity_submission.fa_theme = data["theme"]
        furaffinity_submission.fa_category = data["fa_category"]
        furaffinity_submission.gallery_type = data["subcategory"]
        furaffinity_submission.file_extension = data["extension"]
        furaffinity_submission.image_height = data["height"]
        furaffinity_submission.image_width = data["width"]
        # Handle file import
        furaffinity_submission.files.add(self.import_file(file_path, Submission_File, delete))
        # Handle metadata file import
        furaffinity_submission.metadata.add(self.import_file(json_file_path, Metadata_Files, delete))
        furaffinity_submission.save()
        submission_hash = self.compute_string_hash(category.name + data["artist_url"] + str(data["id"]))
        submission, created = Submissions.objects.get_or_create(submission_hash=submission_hash)
        submission.category = category
        submission.tags.add(*site_tags)
        submission.author = site_user
        # Anything other than a "General" rating counts as mature
        if furaffinity_submission.mature_rating is not None and furaffinity_submission.mature_rating.mature != "General":
            submission.mature = True
        else:
            submission.mature = False
        submission.date = timezone.make_aware(datetime.strptime(data['date'], "%Y-%m-%d %H:%M:%S"))
        # Link the site-wide submission to the FurAffinity submission via the generic relation
        submission.content_type = ContentType.objects.get_for_model(FA_Submission)
        submission.object_id = furaffinity_submission.pk
        submission.save()
        self.delete_imported_file(json_file_path, delete)
        self.delete_imported_file(file_path, delete)