Skip to content

Commit

Permalink
Merge pull request #45 from UAL-RE/Issue_27
Browse files Browse the repository at this point in the history
Address Issue 27 - Selective processing and uploading of articles and collections mentioned in the command-line argument
  • Loading branch information
zoidy authored Jun 3, 2023
2 parents 6132f51 + 40ab539 commit 619cbbd
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 11 deletions.
4 changes: 2 additions & 2 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def main():
log.write_log_in_file('info',
"Fetching articles...",
True)
article_obj = Article(config_file_path)
article_obj = Article(config_file_path, args.ids)
article_data = article_obj.get_articles()
log.write_log_in_file('info',
f"Total articles fetched: {len(article_data)}.",
Expand All @@ -149,7 +149,7 @@ def main():
log.write_log_in_file('info',
"Fetching collections...",
True)
collection_obj = Collection(config_file_path)
collection_obj = Collection(config_file_path, args.ids)
collection_data = collection_obj.get_collections()
log.write_log_in_file('info',
f"Total collections fetched: {len(collection_data)}.",
Expand Down
63 changes: 57 additions & 6 deletions figshare/Article.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import time
import requests
import hashlib
import re
from Log import Log
from Config import Config
from figshare.Integration import Integration
Expand All @@ -18,8 +19,11 @@ class Article:
"""
Class constructor.
Defines the required variables that will be used throughout the class.
:param config: configuration
:param ids: a list of article ids to process. If None or an empty list is passed, all articles will be processed
"""
def __init__(self, config):
def __init__(self, config, ids):
self.config_obj = Config(config)
figshare_config = self.config_obj.figshare_config()
self.system_config = self.config_obj.system_config()
Expand All @@ -40,6 +44,8 @@ def __init__(self, config):
self.curation_storage_location = self.curation_storage_location + "/"
self.article_match_info = {}
self.article_non_match_info = {}
self.input_articles_id = ids
self.matched_curation_folder_list = []

"""
This function is sending requests to 'account/institution/articles api.
Expand Down Expand Up @@ -79,7 +85,15 @@ def get_articles(self):
self.logs.write_log_in_file("info",
f"Page {page} is empty.", True)
break
article_data = self.article_loop(articles, page_size, page, article_data)

if (self.input_articles_id):
filtered_data = [item for item in articles if item['id'] in self.input_articles_id]
filtered_json = json.dumps(filtered_data)
filtered_articles = json.loads(filtered_json)
article_data = self.article_loop(filtered_articles, page_size, page, article_data)
else:
article_data = self.article_loop(articles, page_size, page, article_data)

success = True
else:
retries = self.retries_if_error(
Expand Down Expand Up @@ -678,21 +692,44 @@ def find_matched_articles(self, articles):
article_data[version_data['id']].append(data)
no_matched += 1
self.article_match_info[i] = f"article {data['id']} {version_no} ----- {data['author_dir']}"
if (self.input_articles_id):
self.matched_curation_folder_list.append(data['author_dir'])
else:
self.article_non_match_info[i] = f"article {data['id']} {version_no}"

matched_articles = []
if (self.article_match_info):
self.logs.write_log_in_file('info', "Curation folder found for below articles", True)

# log articles id, version and dir name if matched.
for index in self.article_match_info:
self.logs.write_log_in_file('info', self.article_match_info[index], True)

matched_id = re.search(r'article\s(.*?)\sv0', self.article_match_info[index])
if matched_id:
matched_article_id = matched_id.group(1).strip()
matched_articles.append(matched_article_id)
else:
self.logs.write_log_in_file('error', f"Unable to fetch matched article id - {self.article_match_info[index]}", True)

unmatched_articles = []
if (self.article_non_match_info):
self.logs.write_log_in_file('warning', "Curation folder not found for below articles", True)

# log unmatched articles id, and version
for index in self.article_non_match_info:
self.logs.write_log_in_file('info', self.article_non_match_info[index], True)

unmatched_id = re.search(r'article\s(.*?)\sv0', self.article_non_match_info[index])
if unmatched_id:
unmatched_article_id = unmatched_id.group(1).strip()
unmatched_articles.append(unmatched_article_id)
else:
self.logs.write_log_in_file('error', f"Unable to fetch unmatched article id - {self.article_non_match_info[index]}", True)

self.logs.write_log_in_file("info", f"Total matched unique articles: {len(set(matched_articles))}.", True)
self.logs.write_log_in_file("info", f"Total unmatched unique articles: {len(set(unmatched_articles))}.", True)

self.logs.write_log_in_file("info", f"Total matched article versions: {no_matched}.", True)
self.logs.write_log_in_file("info", f"Total unmatched article versions: {len(self.article_non_match_info)}.", True)

Expand Down Expand Up @@ -748,7 +785,7 @@ def __final_process(self, check_files, copy_files, check_dir, version_data, fold
"""
Called before articles processing.
"""
def __initial_process(self, total_file_size):
def __initial_process(self):
# get curation directory path
curation_storage_location = self.curation_storage_location
# get preservation directory path
Expand All @@ -765,13 +802,27 @@ def __initial_process(self, total_file_size):
Process all articles after fetching from API.
"""
def process_articles(self, articles, total_file_size):
curation_storage_location = self.__initial_process(total_file_size)
curation_storage_location = self.__initial_process()
self.logs.write_log_in_file("info", "Finding matched articles.", True)
article_data = self.find_matched_articles(articles)

# calculate space for given path.
curation_folder_size = self.get_file_size_of_given_path(curation_storage_location)
# Calculate the size of the curation folder
# When article IDs are explicitly passed, curation folder size is calculated based on matched curation folders.
# Otherwise, it is calculated considering all curation folders.
if (self.matched_curation_folder_list):
curation_folder_size = 0
for folder in self.matched_curation_folder_list:
path = curation_storage_location + folder
curation_folder_size += self.get_file_size_of_given_path(path)
else:
curation_folder_size = self.get_file_size_of_given_path(curation_storage_location)

required_space = curation_folder_size + self.total_all_articles_file_size

self.logs.write_log_in_file("info", f"Total size of articles to be processed: {self.total_all_articles_file_size} bytes", True)
self.logs.write_log_in_file("info", f"Total size of the curated folders for the matched articles: {curation_folder_size} bytes", True)
self.logs.write_log_in_file("info", f"Total space required: {required_space} bytes", True)

# check required space after curation process, it will stop process if there isn't sufficient space.
self.check_required_space(required_space)

Expand Down
17 changes: 14 additions & 3 deletions figshare/Collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ class Collection:
"""
Class constructor.
Defines the variables that will be used throughout the class.
:param config: configuration
:param ids: a list of collection ids to process. If None or an empty list is passed, all collections will be processed
"""
def __init__(self, config) -> None:
def __init__(self, config, ids):
self.config_obj = Config(config)
figshare_config = self.config_obj.figshare_config()
self.system_config = self.config_obj.system_config()
Expand All @@ -26,10 +29,11 @@ def __init__(self, config) -> None:
self.institution = int(figshare_config["institution"])
self.logs = Log(config)
self.errors = []
self.article_obj = Article(config)
self.article_obj = Article(config, ids)
self.preservation_storage_location = self.system_config["preservation_storage_location"]
if self.preservation_storage_location[-1] != "/":
self.preservation_storage_location = self.preservation_storage_location + "/"
self.input_collection_ids = ids

"""
API get request sent to '/collections'.
Expand Down Expand Up @@ -63,7 +67,14 @@ def get_collections(self):
self.logs.write_log_in_file("info", "Page of collections is empty.", True)
break

collection_data = self.collections_loop(collections, page_size, page, collection_data)
if (self.input_collection_ids):
filtered_data = [item for item in collections if item['id'] in self.input_collection_ids]
filtered_json = json.dumps(filtered_data)
filtered_collections = json.loads(filtered_json)
collection_data = self.collections_loop(filtered_collections, page_size, page, collection_data)
else:
collection_data = self.collections_loop(collections, page_size, page, collection_data)

success = True
else:
success = False
Expand Down

0 comments on commit 619cbbd

Please sign in to comment.