-
Notifications
You must be signed in to change notification settings - Fork 5
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Updates to read CSV method #1512
base: master
Are you sure you want to change the base?
Changes from all commits
7352c14
4f96c65
8c61da4
375cf75
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
import numpy as np | ||
import pandas as pd | ||
from pandas import DataFrame, DateOffset | ||
from pandas._typing import DtypeArg | ||
|
||
from tlo import Population, Property, Types | ||
|
||
|
@@ -474,7 +475,7 @@ def convert_excel_files_to_csv(folder: Path, files: Optional[list[str]] = None, | |
Path(folder/excel_file_path).unlink() | ||
|
||
|
||
def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame | dict[str, DataFrame]: | ||
def read_csv_files(folder: Path, dtype: DtypeArg | None = None, files: Optional[list[str]] | None | int = 0) -> DataFrame | dict[str, DataFrame]: | ||
""" | ||
A function to read CSV files in a similar way pandas reads Excel files (:py:func:`pandas.read_excel`). | ||
|
||
|
@@ -484,6 +485,7 @@ def read_csv_files(folder: Path, files: Optional[list[str]] = None) -> DataFrame | |
:py:func:`pandas.drop`. | ||
|
||
:param folder: Path to folder containing CSV files to read. | ||
:param dtype: preferred datatype | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually it seems like you do explicitly use this ability to pass |
||
:param files: preferred csv file name(s). This is the same as sheet names in Excel file. Note that if None(no files | ||
selected) then all files in the containing folder will be loaded | ||
|
||
|
@@ -498,15 +500,15 @@ def clean_dataframe(dataframes_dict: dict[str, DataFrame]) -> None: | |
for _key, dataframe in dataframes_dict.items(): | ||
all_data[_key] = dataframe.drop(dataframe.filter(like='Unnamed'), axis=1) # filter and drop Unnamed columns | ||
|
||
if files is None: | ||
if files == 0 or files is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is the reasoning for allowing both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @matt-graham , declaring an explicit flag |
||
for f_name in folder.rglob("*.csv"): | ||
all_data[f_name.stem] = pd.read_csv(f_name) | ||
all_data[f_name.stem] = pd.read_csv(f_name, dtype=dtype) | ||
|
||
else: | ||
for f_name in files: | ||
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv")) | ||
all_data[f_name] = pd.read_csv((folder / f_name).with_suffix(".csv"), dtype=dtype) | ||
# clean and return the dataframe dictionary | ||
clean_dataframe(all_data) | ||
# If only one file loaded return dataframe directly rather than dict | ||
return next(iter(all_data.values())) if len(all_data) == 1 else all_data | ||
return next(iter(all_data.values())) if len(all_data) == 1 and files is not None else all_data | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -332,8 +332,44 @@ def copy_files_to_temporal_directory_and_return_path(tmpdir): | |||||
return tmpdir_resource_filepath | ||||||
|
||||||
|
||||||
def test_read_csv_method_with_no_file(tmpdir): | ||||||
""" read csv method when no file name is supplied | ||||||
def test_pass_datatypes_to_read_csv_method(tmpdir): | ||||||
""" test passing column datatypes to read csv method. Final column datatype should change to what has been passed """ | ||||||
# copy and get resource files path in the temporal directory | ||||||
tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As we don't seem to use the files copied to the temporary directory in |
||||||
sample_data = pd.DataFrame(data={'numbers1': [5,6,8,4,9,6], 'numbers2': [19,27,53,49,75,56]}, dtype=int) | ||||||
sample_data.to_csv(tmpdir_resource_filepath/'sample_data.csv', index=False) | ||||||
# read from the sample data file | ||||||
read_sample_data = read_csv_files(tmpdir_resource_filepath, files=['sample_data']) | ||||||
# confirm column datatype is what was assigned | ||||||
assert read_sample_data.numbers1.dtype and read_sample_data.numbers2.dtype == 'int' | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
The current condition is equivalent to
that is check if |
||||||
# define new datatypes | ||||||
datatype = {'numbers1': int, 'numbers2': float} | ||||||
# pass the new datatypes to read csv method and confirm datatype has changed to what has been declared now | ||||||
assign_dtype = read_csv_files(tmpdir_resource_filepath, files=['sample_data'], dtype=datatype) | ||||||
assert assign_dtype.numbers1.dtype == 'int' and assign_dtype.numbers2.dtype == 'float' | ||||||
|
||||||
|
||||||
def test_read_csv_file_method_passing_none_to_files_argument(tmpdir): | ||||||
""" test reading csv files with one file in the target resource file and setting to None the files argument | ||||||
|
||||||
Expectations | ||||||
1. should return a dictionary | ||||||
2. the dictionary key name should match file name | ||||||
""" | ||||||
# copy and get resource files path in the temporal directory | ||||||
tmpdir_resource_filepath = copy_files_to_temporal_directory_and_return_path(tmpdir) | ||||||
# choose an Excel file with one sheet in it and convert it to csv file | ||||||
convert_excel_files_to_csv(tmpdir_resource_filepath, files=['ResourceFile_load-parameters.xlsx']) | ||||||
# get the folder containing the newly converted csv file and check the expected behavior | ||||||
this_csv_resource_folder = tmpdir_resource_filepath/"ResourceFile_load-parameters" | ||||||
file_names = [csv_file_path.stem for csv_file_path in this_csv_resource_folder.rglob("*.csv")] | ||||||
one_csv_file_in_folder_dict = read_csv_files(this_csv_resource_folder, files=None) | ||||||
assert isinstance(one_csv_file_in_folder_dict, dict) | ||||||
assert set(one_csv_file_in_folder_dict.keys()) == set(file_names) | ||||||
|
||||||
|
||||||
def test_read_csv_method_with_default_value_for_files_argument(tmpdir): | ||||||
""" read csv method when no file name(s) is supplied to the files argument | ||||||
i) should return dictionary. | ||||||
ii) dictionary keys should match csv file names in resource folder | ||||||
iii) all dictionary values should be dataframes | ||||||
|
@@ -350,7 +386,7 @@ def test_read_csv_method_with_no_file(tmpdir): | |||||
|
||||||
|
||||||
def test_read_csv_method_with_one_file(tmpdir): | ||||||
""" test read csv method when one file name is supplied. should return a dataframe | ||||||
""" test read csv method when one file name is supplied to files argument. should return a dataframe | ||||||
:param tmpdir: path to a temporal directory | ||||||
|
||||||
""" | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Optional[T]
is equivalent to T | None
so is redundant here.