-
Notifications
You must be signed in to change notification settings - Fork 6
/
create_labels.py
108 lines (84 loc) · 3.89 KB
/
create_labels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""Creating Labels
Script for running steps in 1_Machine_Learning_Labels.ipynb
"""
# Setup
import pandas as pd
import sqlite3
from dateutil.parser import parse
# Define create_labels function
def create_labels(features_end, prediction_start, prediction_end, conn, output = False):
"""
Generate a list of labels and return the table as a dataframe.
Parameters
----------
features_end
prediction_start
prediction_end
conn: obj
Returns
-------
df_labels: DataFrame
"""
end_x_year = parse(features_end, fuzzy = True).year
start_y_year = parse(prediction_start, fuzzy = True).year
end_y_year = parse(prediction_end, fuzzy = True).year
sql_script="""
drop table if exists sentences_prep;
create table sentences_prep as
select inmate_doc_number,
cast(inmate_sentence_component as integer) as sentence_component,
date([sentence_begin_date_(for_max)]) as sentence_begin_date,
date(actual_sentence_end_date) as sentence_end_date
from sentences;
drop table if exists release_dates_2000_{end_x_year};
create temp table release_dates_2000_{end_x_year} as
select inmate_doc_number, sentence_end_date
from sentences_prep
where sentence_end_date >= '2000-01-01' and sentence_end_date <= '{features_end}';
drop table if exists last_exit_2000_{end_x_year};
create temp table last_exit_2000_{end_x_year} as
select inmate_doc_number, max(sentence_end_date) sentence_end_date
from release_dates_2000_{end_x_year}
group by inmate_doc_number;
drop table if exists admit_{start_y_year}_{end_y_year};
create temp table admit_{start_y_year}_{end_y_year} as
select inmate_doc_number, sentence_component, sentence_begin_date
from sentences_prep
where sentence_begin_date >= '{prediction_start}' and sentence_begin_date <= '{prediction_end}' and sentence_component = 1;
drop table if exists recidivism_{start_y_year}_{end_y_year};
create temp table recidivism_{start_y_year}_{end_y_year} as
select r.inmate_doc_number, r.sentence_end_date, a.sentence_begin_date,
case when a.sentence_begin_date is null then 0 else 1 end recidivism
from last_exit_2000_{end_x_year} r
left join admit_{start_y_year}_{end_y_year} a on r.inmate_doc_number = a.inmate_doc_number;
drop table if exists recidivism_labels_{start_y_year}_{end_y_year};
create table recidivism_labels_{start_y_year}_{end_y_year} as
select distinct inmate_doc_number, recidivism
from recidivism_{start_y_year}_{end_y_year};
""".format(features_end = features_end,
prediction_start = prediction_start,
prediction_end = prediction_end,
end_x_year = end_x_year,
start_y_year = start_y_year,
end_y_year = end_y_year)
cur = conn.cursor()
try:
cur.executescript(sql_script)
cur.close()
conn.commit()
except:
conn.rollback()
if output:
df_label = pd.read_sql('select * from recidivism_labels_{start_y_year}_{end_y_year}'.format(
start_y_year = start_y_year,
end_y_year = end_y_year), conn)
return df_label
else:
print('Labels table ({pred_start} to {pred_end}) created. Use output = True to get a DataFrame as the output.'.format(pred_start = prediction_start,
pred_end = prediction_end))
# Run create_labels function
if __name__ == '__main__':
DB = 'ncdoc.db'
conn = sqlite3.connect(DB)
create_labels('2008-12-31', '2009-01-01', '2013-12-31', conn)
create_labels('2013-12-31', '2014-01-01', '2018-12-31', conn)