From 968371b893afe0c2b2b09e9248fba9f11b75cb3a Mon Sep 17 00:00:00 2001 From: Theodore Vasiloudis Date: Mon, 26 Feb 2024 20:00:57 +0200 Subject: [PATCH] [GSProcessing] Small doc fixes (#750) *Issue #, if available:* *Description of changes:* By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. --- .../developer/input-configuration.rst | 6 +++--- docs/source/gs-processing/usage/example.rst | 6 +++--- .../graphstorm_processing/distributed_executor.py | 15 ++++++++++----- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/docs/source/gs-processing/developer/input-configuration.rst b/docs/source/gs-processing/developer/input-configuration.rst index 36b3ccf864..a9753a842b 100644 --- a/docs/source/gs-processing/developer/input-configuration.rst +++ b/docs/source/gs-processing/developer/input-configuration.rst @@ -286,7 +286,7 @@ can contain the following top-level keys: feature values in the data. - ``transformation`` (JSON object, optional): The type of transformation that will be applied to the feature. For details on - the individual transformations supported see :ref:`supported-transformations`. + the individual transformations supported see :ref:`gsp-supported-transformations-ref`. If this key is missing, the feature is treated as a **no-op** feature without ``kwargs``. @@ -294,7 +294,7 @@ can contain the following top-level keys: applied. - ``kwargs`` (JSON object, optional): A dictionary of parameter names and values. Each individual transformation will have its own - supported parameters, described in :ref:`supported-transformations`. + supported parameters, described in :ref:`gsp-supported-transformations-ref`. - ``name`` (String, optional): The name that will be given to the encoded feature. If not given, **column** is used as the output name. @@ -470,7 +470,7 @@ arguments. You can find all models in the `Huggingface model repository `_. 
- ``max_seq_length`` (Integer, required): Specifies the maximum number of tokens of the input. You can use a length greater than the dataset's longest sentence; or for a safe value choose 128. Make sure to check - the model's max suported length when setting this value, + the model's max supported length when setting this value, -------------- diff --git a/docs/source/gs-processing/usage/example.rst b/docs/source/gs-processing/usage/example.rst index 43125d4d32..3f7feaa719 100644 --- a/docs/source/gs-processing/usage/example.rst +++ b/docs/source/gs-processing/usage/example.rst @@ -32,7 +32,7 @@ that contains the relevant data: Expected file inputs and configuration -------------------------------------- -GSProcessing expects the input files to be in specific format that will allow +GSProcessing expects the input files to be in a specific format that will allow us to perform the processing and prepare the data for partitioning and training. The data files are expected to be: @@ -40,9 +40,9 @@ The data files are expected to be: * Tabular data files. We support CSV-with-header format, or in Parquet format. The files can be split (multiple parts), or a single file. * Available on a local file system or on S3. -* One tabular file source per edge and node type. For example, for a particular edge +* One prefix per edge and node type. For example, for a particular edge type, all node identifiers (source, destination), features, and labels should - exist as columns in a single file source. + exist as columns in one or more files under a common prefix (local or on S3). 
Apart from the data, GSProcessing also requires a configuration file that describes the data and the transformations we will need to apply to the features and any encoding needed for diff --git a/graphstorm-processing/graphstorm_processing/distributed_executor.py b/graphstorm-processing/graphstorm_processing/distributed_executor.py index 791ef5d939..5b22ee2b8b 100644 --- a/graphstorm-processing/graphstorm_processing/distributed_executor.py +++ b/graphstorm-processing/graphstorm_processing/distributed_executor.py @@ -180,7 +180,7 @@ def __init__( "graph" ] else: - logging.warning("Unrecognized version name: %s", config_version) + logging.warning("Unrecognized configuration file version name: %s", config_version) try: converter = GConstructConfigConverter() self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)[ @@ -192,8 +192,10 @@ def __init__( "graph" in dataset_config_dict ), "Top-level element 'graph' needs to exist in a GSProcessing config" self.graph_config_dict = dataset_config_dict["graph"] + logging.info("Parsed config file as GSProcessing config") else: # Older versions of GConstruct configs might be missing a version entry + logging.warning("No configuration file version name, trying to parse as GConstruct...") converter = GConstructConfigConverter() self.graph_config_dict = converter.convert_to_gsprocessing(dataset_config_dict)["graph"] @@ -263,7 +265,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--config-filename", type=str, - help="GSProcessing data configuration filename.", + help="GConstruct or GSProcessing data configuration filename.", required=True, ) parser.add_argument( @@ -309,9 +311,12 @@ def main(): is_sagemaker_execution = os.path.exists("/opt/ml/config/processingjobconfig.json") if gsprocessing_args.input_prefix.startswith("s3://"): - assert gsprocessing_args.output_prefix.startswith( - "s3://" - ), "When providing S3 input and output prefixes, they must both be S3." 
+ assert gsprocessing_args.output_prefix.startswith("s3://"), ( + "When providing S3 input and output prefixes, they must both be S3 URIs, got: " + f"input: '{gsprocessing_args.input_prefix}' " + f"and output: '{gsprocessing_args.output_prefix}'." + ) + filesystem_type = "s3" else: # Ensure input and output prefixes exist and convert to absolute paths