Skip to content

Commit

Permalink
Merge pull request #45 from UAL-RE/Issue_27
Browse files Browse the repository at this point in the history
Address Issue 27 - Selective processing and uploading of articles and collections mentioned in the command-line argument
  • Loading branch information
zoidy authored Jun 3, 2023
2 parents 6132f51 + 40ab539 commit 619cbbd
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 11 deletions.
4 changes: 2 additions & 2 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def main():
log.write_log_in_file('info',
"Fetching articles...",
True)
article_obj = Article(config_file_path)
article_obj = Article(config_file_path, args.ids)
article_data = article_obj.get_articles()
log.write_log_in_file('info',
f"Total articles fetched: {len(article_data)}.",
Expand All @@ -149,7 +149,7 @@ def main():
log.write_log_in_file('info',
"Fetching collections...",
True)
collection_obj = Collection(config_file_path)
collection_obj = Collection(config_file_path, args.ids)
collection_data = collection_obj.get_collections()
log.write_log_in_file('info',
f"Total collections fetched: {len(collection_data)}.",
Expand Down
63 changes: 57 additions & 6 deletions figshare/Article.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import time
import requests
import hashlib
import re
from Log import Log
from Config import Config
from figshare.Integration import Integration
Expand All @@ -18,8 +19,11 @@ class Article:
"""
Class constructor.
Defines the required variables that will be used throughout the class.
:param config: configuration
:param ids: a list of article ids to process. If None or an empty list is passed, all articles will be processed
"""
def __init__(self, config):
def __init__(self, config, ids):
self.config_obj = Config(config)
figshare_config = self.config_obj.figshare_config()
self.system_config = self.config_obj.system_config()
Expand All @@ -40,6 +44,8 @@ def __init__(self, config):
self.curation_storage_location = self.curation_storage_location + "/"
self.article_match_info = {}
self.article_non_match_info = {}
self.input_articles_id = ids
self.matched_curation_folder_list = []

"""
This function is sending requests to 'account/institution/articles api.
Expand Down Expand Up @@ -79,7 +85,15 @@ def get_articles(self):
self.logs.write_log_in_file("info",
f"Page {page} is empty.", True)
break
article_data = self.article_loop(articles, page_size, page, article_data)

if (self.input_articles_id):
filtered_data = [item for item in articles if item['id'] in self.input_articles_id]
filtered_json = json.dumps(filtered_data)
filtered_articles = json.loads(filtered_json)
article_data = self.article_loop(filtered_articles, page_size, page, article_data)
else:
article_data = self.article_loop(articles, page_size, page, article_data)

success = True
else:
retries = self.retries_if_error(
Expand Down Expand Up @@ -678,21 +692,44 @@ def find_matched_articles(self, articles):
article_data[version_data['id']].append(data)
no_matched += 1
self.article_match_info[i] = f"article {data['id']} {version_no} ----- {data['author_dir']}"
if (self.input_articles_id):
self.matched_curation_folder_list.append(data['author_dir'])
else:
self.article_non_match_info[i] = f"article {data['id']} {version_no}"

matched_articles = []
if (self.article_match_info):
self.logs.write_log_in_file('info', "Curation folder found for below articles", True)

# log articles id, version and dir name if matched.
for index in self.article_match_info:
self.logs.write_log_in_file('info', self.article_match_info[index], True)

matched_id = re.search(r'article\s(.*?)\sv0', self.article_match_info[index])
if matched_id:
matched_article_id = matched_id.group(1).strip()
matched_articles.append(matched_article_id)
else:
self.logs.write_log_in_file('error', f"Unable to fetch matched article id - {self.article_match_info[index]}", True)

unmatched_articles = []
if (self.article_non_match_info):
self.logs.write_log_in_file('warning', "Curation folder not found for below articles", True)

# log unmatched articles id, and version
for index in self.article_non_match_info:
self.logs.write_log_in_file('info', self.article_non_match_info[index], True)

unmatched_id = re.search(r'article\s(.*?)\sv0', self.article_non_match_info[index])
if unmatched_id:
unmatched_article_id = unmatched_id.group(1).strip()
unmatched_articles.append(unmatched_article_id)
else:
self.logs.write_log_in_file('error', f"Unable to fetch unmatched article id - {self.article_non_match_info[index]}", True)

self.logs.write_log_in_file("info", f"Total matched unique articles: {len(set(matched_articles))}.", True)
self.logs.write_log_in_file("info", f"Total unmatched unique articles: {len(set(unmatched_articles))}.", True)

self.logs.write_log_in_file("info", f"Total matched article versions: {no_matched}.", True)
self.logs.write_log_in_file("info", f"Total unmatched article versions: {len(self.article_non_match_info)}.", True)

Expand Down Expand Up @@ -748,7 +785,7 @@ def __final_process(self, check_files, copy_files, check_dir, version_data, fold
"""
Called before articles processing.
"""
def __initial_process(self, total_file_size):
def __initial_process(self):
# get curation directory path
curation_storage_location = self.curation_storage_location
# get preservation directory path
Expand All @@ -765,13 +802,27 @@ def __initial_process(self, total_file_size):
Process all articles after fetching from API.
"""
def process_articles(self, articles, total_file_size):
curation_storage_location = self.__initial_process(total_file_size)
curation_storage_location = self.__initial_process()
self.logs.write_log_in_file("info", "Finding matched articles.", True)
article_data = self.find_matched_articles(articles)

# calculate space for given path.
curation_folder_size = self.get_file_size_of_given_path(curation_storage_location)
# Calculate the size of the curation folder
# When article IDs are explicitly passed, curation folder size is calculated based on matched curation folders.
# Otherwise, it is calculated considering all curation folders.
if (self.matched_curation_folder_list):
curation_folder_size = 0
for folder in self.matched_curation_folder_list:
path = curation_storage_location + folder
curation_folder_size += self.get_file_size_of_given_path(path)
else:
curation_folder_size = self.get_file_size_of_given_path(curation_storage_location)

required_space = curation_folder_size + self.total_all_articles_file_size

self.logs.write_log_in_file("info", f"Total size of articles to be processed: {self.total_all_articles_file_size} bytes", True)
self.logs.write_log_in_file("info", f"Total size of the curated folders for the matched articles: {curation_folder_size} bytes", True)
self.logs.write_log_in_file("info", f"Total space required: {required_space} bytes", True)

# check required space after curation process, it will stop process if there isn't sufficient space.
self.check_required_space(required_space)

Expand Down
17 changes: 14 additions & 3 deletions figshare/Collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ class Collection:
"""
Class constructor.
Defines the variables that will be used throughout the class.
:param config: configuration
:param ids: a list of collection ids to process. If None or an empty list is passed, all collections will be processed
"""
def __init__(self, config) -> None:
def __init__(self, config, ids):
self.config_obj = Config(config)
figshare_config = self.config_obj.figshare_config()
self.system_config = self.config_obj.system_config()
Expand All @@ -26,10 +29,11 @@ def __init__(self, config) -> None:
self.institution = int(figshare_config["institution"])
self.logs = Log(config)
self.errors = []
self.article_obj = Article(config)
self.article_obj = Article(config, ids)
self.preservation_storage_location = self.system_config["preservation_storage_location"]
if self.preservation_storage_location[-1] != "/":
self.preservation_storage_location = self.preservation_storage_location + "/"
self.input_collection_ids = ids

"""
API get request sent to '/collections'.
Expand Down Expand Up @@ -63,7 +67,14 @@ def get_collections(self):
self.logs.write_log_in_file("info", "Page of collections is empty.", True)
break

collection_data = self.collections_loop(collections, page_size, page, collection_data)
if (self.input_collection_ids):
filtered_data = [item for item in collections if item['id'] in self.input_collection_ids]
filtered_json = json.dumps(filtered_data)
filtered_collections = json.loads(filtered_json)
collection_data = self.collections_loop(filtered_collections, page_size, page, collection_data)
else:
collection_data = self.collections_loop(collections, page_size, page, collection_data)

success = True
else:
success = False
Expand Down

0 comments on commit 619cbbd

Please sign in to comment.