Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create FHIR Extractor #2

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions dataownertools/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import unicodedata

def name(name):
    """Clean a personal name for matching: ASCII-fold, trim, upper-case.

    Returns None when the input is None.
    """
    if name is None:
        return None
    # NFKD decomposition splits accented characters into base letter +
    # combining mark, so the ASCII encode below keeps the base letter.
    decomposed = unicodedata.normalize('NFKD', name)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    cleaned = ascii_bytes.strip().upper()
    return cleaned.decode('ascii')

def phone(phone):
    """Reduce a phone number to its digit characters only.

    Returns None when the input is None.
    """
    if phone is None:
        return None
    digits = [ch for ch in phone.strip() if ch.isdigit()]
    return ''.join(digits)

def address(address):
    """Clean a street address: ASCII-fold accents, trim, upper-case.

    Returns None when the input is None.
    """
    if address is None:
        return None
    # Same fold-to-ASCII pipeline used for names, so addresses from
    # different sources normalize consistently.
    normalized = unicodedata.normalize('NFKD', address)
    as_ascii = normalized.encode('ascii', 'ignore')
    return as_ascii.strip().upper().decode('ascii')

def zip(zip):
    """Trim surrounding whitespace from a ZIP code; None passes through.

    NOTE: the function name intentionally mirrors the field it cleans and
    shadows the builtin `zip` within this module.
    """
    return None if zip is None else zip.strip()

def email(email):
    """Clean an email address: ASCII-fold accents, trim, upper-case.

    Returns None when the input is None.
    """
    if email is None:
        return None
    decomposed = unicodedata.normalize('NFKD', email)
    raw = decomposed.encode('ascii', 'ignore')
    return raw.strip().upper().decode('ascii')
26 changes: 26 additions & 0 deletions dataownertools/report.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from collections import Counter

class Report:
    """Accumulates per-field data-quality issue counts for extracted records."""

    def __init__(self, fields):
        """Create one Counter per field name in *fields*."""
        self.field_counters = {f: Counter() for f in fields}

    def validate(self, field_name, value):
        """Record data-quality issues observed in *value* under *field_name*.

        A None value is counted as 'NULL Value' and nothing else; otherwise
        *value* is checked for non-ASCII, non-printable, and empty content.
        """
        counter = self.field_counters[field_name]
        if value is None:
            counter['NULL Value'] += 1
            return
        if not value.isascii():
            counter['Contains Non-ASCII Characters'] += 1
        if not value.isprintable():
            counter['Contains Non-printable Characters'] += 1
        # ''.isspace() is False, so the zero-length string must be checked
        # explicitly or truly-empty values are never counted.
        if value == '' or value.isspace():
            counter['Empty String'] += 1

    def print(self):
        """Print a plain-text summary of issue counts, one section per field."""
        for field, counter in self.field_counters.items():
            print(field)
            print('--------------------')
            for issue, count in counter.items():
                print('{}: {}'.format(issue, count))
            print('')
86 changes: 20 additions & 66 deletions extract.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,12 @@
import csv
import argparse
import unicodedata
from dataownertools import clean, report
from collections import Counter
from random import shuffle
from sqlalchemy import create_engine, MetaData, Table
from sqlalchemy.sql import select

def validate(report, field, value):
if value is None:
report[field]['NULL Value'] += 1
return
if not value.isascii():
report[field]['Contains Non-ASCII Characters'] += 1
if not value.isprintable():
report[field]['Contains Non-printable Characters'] += 1
if value.isspace():
report[field]['Empty String'] += 1

def clean_name(name):
if name is None:
return None
ascii_name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')
return ascii_name.strip().upper().decode('ascii')

def clean_phone(phone):
if phone is None:
return None
return ''.join(filter(lambda x: x.isdigit(), phone.strip()))

def clean_address(address):
if address is None:
return None
ascii_address = unicodedata.normalize('NFKD', address).encode('ascii', 'ignore')
return ascii_address.strip().upper().decode('ascii')

def clean_zip(zip):
if zip is None:
return None
return zip.strip()

def clean_email(email):
if email is None:
return None
ascii_email = unicodedata.normalize('NFKD', email).encode('ascii', 'ignore')
return ascii_email.strip().upper().decode('ascii')

def case_insensitive_lookup(row, desired_key):
if row.has_key(desired_key):
return row[desired_key]
Expand All @@ -57,9 +19,7 @@ def case_insensitive_lookup(row, desired_key):
'household_street_address', 'household_zip', 'parent_given_name' , 'parent_family_name',
'parent_email']

report = {}
for h in header:
report[h] = Counter()
report = report.Report(header)

export_count = 0

Expand All @@ -81,34 +41,34 @@ def case_insensitive_lookup(row, desired_key):
for row in results:
output_row = [case_insensitive_lookup(row, 'patid')]
given_name = case_insensitive_lookup(row, 'given_name')
validate(report, 'given_name', given_name)
output_row.append(clean_name(given_name))
report.validate('given_name', given_name)
output_row.append(clean.name(given_name))
family_name = case_insensitive_lookup(row, 'family_name')
validate(report, 'family_name', family_name)
output_row.append(clean_name(family_name))
report.validate('family_name', family_name)
output_row.append(clean.name(family_name))
birth_date = case_insensitive_lookup(row, 'birth_date')
output_row.append(birth_date.isoformat())
sex = case_insensitive_lookup(row, 'sex')
validate(report, 'sex', sex)
report.validate('sex', sex)
output_row.append(sex.strip())
phone_number = case_insensitive_lookup(row, 'household_phone')
validate(report, 'phone_number', phone_number)
output_row.append(clean_phone(phone_number))
report.validate('phone_number', phone_number)
output_row.append(clean.phone(phone_number))
household_street_address = case_insensitive_lookup(row, 'household_street_address')
validate(report, 'household_street_address', household_street_address)
output_row.append(clean_address(household_street_address))
report.validate('household_street_address', household_street_address)
output_row.append(clean.address(household_street_address))
household_zip = case_insensitive_lookup(row, 'household_zip')
validate(report, 'household_zip', household_zip)
output_row.append(clean_zip(household_zip))
report.validate('household_zip', household_zip)
output_row.append(clean.zip(household_zip))
parent_given_name = case_insensitive_lookup(row, 'parent_given_name')
validate(report, 'parent_given_name', parent_given_name)
output_row.append(clean_name(parent_given_name))
report.validate('parent_given_name', parent_given_name)
output_row.append(clean.name(parent_given_name))
parent_family_name = case_insensitive_lookup(row, 'parent_family_name')
validate(report, 'parent_family_name', parent_family_name)
output_row.append(clean_name(parent_family_name))
report.validate('parent_family_name', parent_family_name)
output_row.append(clean.name(parent_family_name))
parent_email = case_insensitive_lookup(row, 'household_email')
validate(report, 'parent_email', parent_email)
output_row.append(clean_email(parent_email))
report.validate('parent_email', parent_email)
output_row.append(clean.email(parent_email))
output_rows.append(output_row)
export_count += 1

Expand All @@ -123,10 +83,4 @@ def case_insensitive_lookup(row, desired_key):
print('Total records exported: {}'.format(export_count))
print('')

for field, counter in report.items():
print(field)
print('--------------------')
for issue, count in counter.items():
print("{}: {}".format(issue, count))
print('')

report.print()
67 changes: 67 additions & 0 deletions fhirextract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import csv
import argparse
import unicodedata
import ndjson
from dataownertools import clean, report
from collections import Counter
from random import shuffle

# CSV layout shared with the other PPRL extractors; column order matters.
header = ['record_id', 'given_name', 'family_name', 'DOB', 'sex', 'phone_number',
          'household_street_address', 'household_zip', 'parent_given_name', 'parent_family_name',
          'parent_email']

report = report.Report(header)

export_count = 0

parser = argparse.ArgumentParser(description='Tool for extracting, validating and cleaning data for CODI PPRL')
parser.add_argument('--bulkfile', nargs=1, required=True, help='Path to bulk FHIR patient resources')
args = parser.parse_args()

bulkfile_path = args.bulkfile[0]

output_rows = []


def dig(resource, *path):
    """Null-safe navigation through nested dicts/lists.

    Follows *path* one key/index at a time and returns None as soon as any
    step is missing or of the wrong shape, instead of raising
    KeyError/IndexError/TypeError. Resolves the TODO about null-safe
    searches through the Patient resource.
    """
    current = resource
    for step in path:
        if current is None:
            return None
        try:
            current = current[step]
        except (KeyError, IndexError, TypeError):
            return None
    return current


with open(bulkfile_path) as f:
    reader = ndjson.reader(f)

    for patient in reader:
        patient_row = []
        patient_row.append(dig(patient, 'id'))
        given_name = dig(patient, 'name', 0, 'given', 0)
        report.validate('given_name', given_name)
        patient_row.append(clean.name(given_name))
        family_name = dig(patient, 'name', 0, 'family')
        report.validate('family_name', family_name)
        patient_row.append(clean.name(family_name))
        patient_row.append(dig(patient, 'birthDate'))
        sex = dig(patient, 'gender')
        report.validate('sex', sex)
        # FHIR gender is a lowercase code ('male'/'female'/...); export its
        # first letter upper-cased, or None when absent.
        patient_row.append(sex[0].upper() if sex else None)
        phone_number = dig(patient, 'telecom', 0, 'value')
        report.validate('phone_number', phone_number)
        patient_row.append(clean.phone(phone_number))
        household_street_address = dig(patient, 'address', 0, 'line', 0)
        report.validate('household_street_address', household_street_address)
        patient_row.append(clean.address(household_street_address))
        household_zip = dig(patient, 'address', 0, 'postalCode')
        report.validate('household_zip', household_zip)
        patient_row.append(clean.zip(household_zip))
        # Parent fields do not exist on a FHIR Patient resource; emit blanks
        # to keep the shared column layout intact.
        patient_row.append("")
        patient_row.append("")
        patient_row.append("")
        output_rows.append(patient_row)
        export_count += 1

# Shuffle so the output order reveals nothing about source record order.
shuffle(output_rows)

with open('pii.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)
    for output_row in output_rows:
        writer.writerow(output_row)

print('Total records exported: {}'.format(export_count))
print('')
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ clkhash>=0.16.0
psycopg2>=2.8.3
anonlink-client>=0.1.4
ijson>=3.1.2
ndjson>=0.3.1