Data analyzed:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yaml
import seaborn as sns
from pandas.tseries import offsets
import matplotlib.ticker as mtick
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
AutoMinorLocator, NullLocator)
import matplotlib.dates as mdates
# Reader options shared by every input file: pipe-delimited, double-quote
# quoted, gzip-compressed, UTF-8 encoded.
csv_opts = {
    'sep': '|',
    'quotechar': '"',
    'compression': 'gzip',
    'encoding': 'utf-8',
}


def _read_checked(path, date_cols):
    """Read one input file and verify that `hashid` works as a record key.

    `date_cols` are parsed as datetimes. The two assertions require every
    `hashid` to be distinct and none to be null.
    """
    frame = pd.read_csv(path, parse_dates=date_cols, **csv_opts)
    assert len(set(frame['hashid'])) == len(frame)
    assert sum(frame['hashid'].isnull()) == 0
    return frame


smu = _read_checked('input/smu.csv.gz', ['assigned_date', 'removed_date'])
rhu = _read_checked('input/rhu.csv.gz', ['date_in', 'date_out'])
srms2 = _read_checked('input/srms-2.csv.gz', ['placement_date', 'release_date'])
srms1 = _read_checked('input/srms-1.csv.gz', ['placement_date', 'release_date'])

# Standardizing date field names to those used in SRMS
rhu = rhu.rename(columns={'date_in': 'placement_date',
                          'date_out': 'release_date'})
smu = smu.rename(columns={'assigned_date': 'placement_date',
                          'removed_date': 'release_date'})
A close review of these various datasets gives us an overview of solitary confinement practices at the NWDC, but also raises further questions about consistency of record-keeping and reporting. While the datasets are not directly comparable, and lack unique or consistent identifiers that would allow de-duplication across datasets, there are some characteristics we would expect if record-keeping and reporting were consistent:
While this is true for most years, we find that in 2018 the RHU dataset reports more total placements than SMU.
We find that the SMU and RHU datasets include significantly more stays longer than 14 days than those reported in the SRMS datasets. This might be explained if multiple placements of the same individual tracked as multiple entries in the GEO internal reports are reported to ICE SRMS as a single entry, especially in the case of the SMU dataset, which tracks placements by housing assignment.
While not conclusive, the data is suggestive of underreporting of long solitary placements in ICE SRMS records for NWDC.
# SRMS datasets for NWDC have no 0 length placements
# (Note national SRMS data released by POGO does include 0 length placements for other detention centers)
assert not (srms1['placement_date'] == srms1['release_date']).any()
assert not (srms2['placement_date'] == srms2['release_date']).any()

# GEO datasets have 0 length placements
assert (smu['placement_date'] == smu['release_date']).any()
assert (rhu['placement_date'] == rhu['release_date']).any()

# Tag every record with its source so the frames can be stacked and compared.
rhu['dataset'] = 'RHU'
smu['dataset'] = 'SMU'
srms1['dataset'] = 'SRMS 1'
srms2['dataset'] = 'SRMS 2'

geo_datasets = [rhu, smu]
ice_datasets = [srms1, srms2]
datasets = geo_datasets + ice_datasets

# Calculating first-day-exclusive placement lengths for each dataset
# (release minus placement, expressed in days; a same-day stay is 0).
# The column is added in place to each source frame.
for d in datasets:
    d['days_solitary'] = (d['release_date'] - d['placement_date']) / np.timedelta64(1, 'D')

cols = ['dataset', 'placement_date', 'release_date', 'days_solitary', 'hashid']
df = pd.concat([smu[cols], rhu[cols], srms1[cols], srms2[cols]], axis=0)

# Keep only records with a release date. The explicit .copy() makes `df` an
# independent frame, so adding `long_stay` below cannot raise
# SettingWithCopyWarning or write into a temporary slice.
df = df[pd.notnull(df['release_date'])].copy()

# Placements strictly longer than 14 days are the "long stays" analysed below.
df['long_stay'] = df['days_solitary'] > 14
Below we report basic descriptive statistics for each dataset.
# Group once and derive every per-dataset summary from the same grouping.
by_dataset = df.groupby(['dataset'])
placements = by_dataset['placement_date']
lengths = by_dataset['days_solitary']

# Column name -> per-dataset statistic, in the order the columns are written.
summary_columns = {
    'total': placements.count(),
    'min_date': placements.min(),
    'max_date': placements.max(),
    'avg_days': lengths.mean(),
    'med_days': lengths.median(),
    'min_days': lengths.min(),
    'max_days': lengths.max(),
    'total_long': by_dataset['long_stay'].sum(),
}

table = pd.DataFrame()
for column, values in summary_columns.items():
    table[column] = values

table.to_csv('output/dataset_description.csv')

# Rows for RHU, SMU and SRMS 2 only.
overlap_table = table.loc[['RHU', 'SMU', 'SRMS 2']]
Table 1: Dataset overview: total records, date range, descriptive statistics for `days_solitary`, and total placements >14 days
| dataset | total | min_date | max_date | avg_days | med_days | min_days | max_days | total_long |
|---|---|---|---|---|---|---|---|---|
| RHU | 2457 | 2015-01-03 | 2020-05-28 | 14.077737 | 4.0 | 0.0 | 693.0 | 480.0 |
| SMU | 3433 | 2013-06-27 | 2020-03-31 | 9.976697 | 3.0 | 0.0 | 488.0 | 598.0 |
| SRMS 1 | 357 | 2013-05-13 | 2018-05-14 | 59.605042 | 30.0 | 1.0 | 781.0 | 296.0 |
| SRMS 2 | 453 | 2013-09-03 | 2020-03-16 | 59.410596 | 31.0 | 1.0 | 691.0 | 387.0 |
Note that the figure below covers the entire time period covered by all of the datasets, for `placement_date` values ranging from May 13, 2013 to May 28, 2020; none of the datasets report complete information for the entire range.
Figure 1: Total records per calendar year
# Figure 1: distinct record ids per dataset, bucketed by the calendar year of
# placement_date. 'YS' (year start) is used instead of the equivalent 'AS'
# alias, which pandas 2.2 deprecated.
g = df.set_index('placement_date').groupby([pd.Grouper(freq='YS'), 'dataset'])
data = g['hashid'].nunique().unstack()  # rows: year, columns: dataset
data.index = data.index.year  # label bars with the year, not a full timestamp

fig, ax = plt.subplots()
data.plot(kind='bar', ax=ax, figsize=(10,6))

# Grey panel with white horizontal grid lines drawn behind the bars.
ax.set_facecolor('#DDDDDD')
ax.set_axisbelow(True)
ax.yaxis.grid(color='#FFFFFF')
# Y ticks: minor every 50 records, major every 100.
ax.yaxis.set_minor_locator(MultipleLocator(50))
ax.yaxis.set_major_locator(MultipleLocator(100))

plt.xticks(rotation=0)
plt.title('Count of total records', fontsize=18, pad=10)
plt.ylabel('count', fontsize=12)
plt.xlabel('placement_date', fontsize=12)

# Source/credit note, right-aligned just below the figure.
txt='Data Source: NWDC segregation datasets released via FOIA\nFigure: UW Center for Human Rights'
plt.figtext(1, -.03, txt, wrap=True, horizontalalignment='right', fontsize=12)
plt.show();
Figure 2: Mean placement length by calendar year
# Figure 2: mean days_solitary per dataset, bucketed by the calendar year of
# placement_date. 'YS' (year start) is used instead of the equivalent 'AS'
# alias, which pandas 2.2 deprecated.
g = df.set_index('placement_date').groupby([pd.Grouper(freq='YS'), 'dataset'])
data = g['days_solitary'].mean().unstack()  # rows: year, columns: dataset
data.index = data.index.year  # label bars with the year, not a full timestamp

fig, ax = plt.subplots()
data.plot(kind='bar', ax=ax, figsize=(10,6))

# Grey panel with white horizontal grid lines drawn behind the bars.
ax.set_facecolor('#DDDDDD')
ax.set_axisbelow(True)
ax.yaxis.grid(color='#FFFFFF')
# Y ticks: minor every 5 days, major every 15.
ax.yaxis.set_minor_locator(MultipleLocator(5))
ax.yaxis.set_major_locator(MultipleLocator(15))

plt.xticks(rotation=0)
plt.title('Annual mean `days_solitary`', fontsize=18, pad=10)
plt.ylabel('`days_solitary`', fontsize=12)
plt.xlabel('placement_date', fontsize=12)

# Source/credit note, right-aligned just below the figure.
txt='Data Source: NWDC segregation datasets released via FOIA\nFigure: UW Center for Human Rights'
plt.figtext(1, -.03, txt, wrap=True, horizontalalignment='right', fontsize=12)
plt.show();
The distribution of placements by length (focusing on placements of 30 days or less) shows the differences between the various datasets. The GEO Group datasets (SMU, RHU) include more short placements, while the ICE SRMS datasets include more long placements, with concentrations of placements at 14 days, 21 days, and 30 days, corresponding with ICE reporting requirements. ICE SRMS datasets report no placements less than one day long (0 days). Of the GEO Group datasets, for SMU 0 is the most frequent stay length in this sample; for the RHU dataset, 1 is the most frequent.

Note that this analysis calculates `days_solitary` as 0 for placements starting and ending on the same day. As discussed in Data Appendix 1. GEO Group Internal Datasets ("SMU", "RHU"), the original RHU dataset displays a first-day inclusive stay length calculation (where stays starting and ending on the same day == 1), while the original SMU dataset displays a first-day exclusive stay length calculation (where stays starting and ending on the same day == 0), and includes hourly placement and release datetime values. It is possible that this calculation underestimates placement lengths for the RHU dataset, in favor of consistency with the other datasets.
Figure 3: Distribution of records for `days_solitary` <= 30
# Figure 3: per-dataset distribution of placement lengths of 30 days or less.
data = df[df['days_solitary'] <= 30]

names = ['RHU', 'SMU', 'SRMS 1', 'SRMS 2']
x1, x2, x3, x4 = (
    data.loc[data['dataset'] == name, 'days_solitary'].to_numpy()
    for name in names
)

# One-day-wide bins with explicit edges 0, 1, ..., 31. Asking for 30
# equal-width bins over [0, 30] closes the last bin on both sides, which
# merges 29-day and 30-day stays into a single bar. Explicit edges keep the
# 30-day placements in a bar of their own.
# NOTE(review): assumes no negative days_solitary values (Table 1 reports a
# minimum of 0); negative lengths would fall outside these edges.
day_edges = np.arange(0, 32)

# density=True normalizes each dataset separately, so bars compare shapes
# rather than raw counts.
plt.hist([x1, x2, x3, x4], bins=day_edges, density=True, label=names)
plt.legend()
plt.xlabel('days_solitary')
plt.ylabel('normalized_records')
plt.title('Distribution of `days_solitary` <= 30')
plt.show();
Below we restrict our analysis to the SMU, RHU, and SRMS 2 datasets, for which complete data exists for `placement_date` values ranging from January 3, 2015 to March 16, 2020.
In the following figure we visualize the number of stays longer than 14 days in each dataset, which are required to be reported via the ICE SRMS. Both of the GEO Group datasets, RHU and SMU, consistently record more long stays than are reported via the ICE SRMS. In the case of the SMU, which records placements by housing assignment, this may be explained by long stays broken up into multiple shorter housing assignments of 14 days or more (note Figure 2 above showing that annual average placement lengths for SMU are always below 14 days).
Figure 4: Total long stays (>14 days) per calendar year
# Figure 4: number of long stays (>14 days) per dataset and calendar year,
# for RHU, SMU and SRMS 2 within the 2015-01-03 to 2020-03-16 window.
data = df[df['dataset'] != 'SRMS 1']

# Label slicing needs a sorted DatetimeIndex: on an unsorted one, recent
# pandas raises KeyError when a slice bound is not an exact index value.
# Rows with no placement_date cannot fall inside the window, so dropping them
# first does not change the result and keeps the sorted index monotonic.
data = (
    data.dropna(subset=['placement_date'])
    .set_index('placement_date')
    .sort_index()
    .loc['2015-01-03': '2020-03-16']
)

# 'YS' (year start) replaces the equivalent 'AS' alias, deprecated in pandas 2.2.
g = data.groupby([pd.Grouper(freq='YS'), 'dataset'])
data = g['long_stay'].sum().unstack()  # summing booleans counts the long stays
data.index = data.index.year  # label bars with the year, not a full timestamp

fig, ax = plt.subplots()
data.plot(kind='bar', ax=ax, figsize=(10,6))

# Grey panel with white horizontal grid lines drawn behind the bars.
ax.set_facecolor('#DDDDDD')
ax.set_axisbelow(True)
ax.yaxis.grid(color='#FFFFFF')

plt.xticks(rotation=0)
plt.title('Count of long stays (>14 days)', fontsize=18, pad=10)
plt.ylabel('count', fontsize=12)
plt.xlabel('placement_date', fontsize=12)

# Source/credit note, right-aligned just below the figure.
txt='Data Source: NWDC segregation datasets released via FOIA\nFigure: UW Center for Human Rights'
plt.figtext(1, -.03, txt, wrap=True, horizontalalignment='right', fontsize=12)
plt.show();
Counting total days of solitary placements associated with long stays (>14 days) per calendar year, the relationship is less consistent. During most years, either SMU or RHU reflects more total days associated with long stays than SRMS; except in the case of 2018, when the SRMS reflects slightly more total days associated with long stays than the RHU dataset, and significantly more than the SMU dataset. Again, note that the SMU dataset's total days associated with long stays might differ from the other datasets due to the fact that it reflects housing placements, rather than total length of stay.
Figure 5: Total days for long stays (>14 days) per calendar year
# Figure 5: total days_solitary accumulated by long stays (>14 days) per
# dataset and calendar year, for RHU, SMU and SRMS 2 within the
# 2015-01-03 to 2020-03-16 window.
data = df[df['dataset'] != 'SRMS 1']
data = data[data['long_stay']]  # long_stay is already a boolean mask

# Label slicing needs a sorted DatetimeIndex: on an unsorted one, recent
# pandas raises KeyError when a slice bound is not an exact index value.
# Rows with no placement_date cannot fall inside the window, so dropping them
# first does not change the result and keeps the sorted index monotonic.
data = (
    data.dropna(subset=['placement_date'])
    .set_index('placement_date')
    .sort_index()
    .loc['2015-01-03': '2020-03-16']
)

# 'YS' (year start) replaces the equivalent 'AS' alias, deprecated in pandas 2.2.
g = data.groupby([pd.Grouper(freq='YS'), 'dataset'])
data = g['days_solitary'].sum().unstack()  # rows: year, columns: dataset
data.index = data.index.year  # label bars with the year, not a full timestamp

fig, ax = plt.subplots()
data.plot(kind='bar', ax=ax, figsize=(10,6))

# Grey panel with white horizontal grid lines drawn behind the bars.
ax.set_facecolor('#DDDDDD')
ax.set_axisbelow(True)
ax.yaxis.grid(color='#FFFFFF')

plt.xticks(rotation=0)
plt.title('Sum of `days_solitary` for long stays (>14 days)', fontsize=18, pad=10)
plt.ylabel('`days_solitary`', fontsize=12)
plt.xlabel('placement_date', fontsize=12)

# Source/credit note, right-aligned just below the figure.
txt='Data Source: NWDC segregation datasets released via FOIA\nFigure: UW Center for Human Rights'
plt.figtext(1, -.03, txt, wrap=True, horizontalalignment='right', fontsize=12)
plt.show();
Looking at all placements including short placements under 14 days, RHU and SMU usually report more total days than reported to SRMS, which would be expected given ICE reporting guidelines. However, during 2018 the SMU dataset records fewer total days than those reported to SRMS, which is suggestive of data collection inconsistencies:
Figure 6: Total days for all solitary stays per calendar year
# Figure 6: total days_solitary across all stays per dataset and calendar
# year, for RHU, SMU and SRMS 2 within the 2015-01-03 to 2020-03-16 window.
data = df[df['dataset'] != 'SRMS 1']

# Label slicing needs a sorted DatetimeIndex: on an unsorted one, recent
# pandas raises KeyError when a slice bound is not an exact index value.
# Rows with no placement_date cannot fall inside the window, so dropping them
# first does not change the result and keeps the sorted index monotonic.
data = (
    data.dropna(subset=['placement_date'])
    .set_index('placement_date')
    .sort_index()
    .loc['2015-01-03': '2020-03-16']
)

# 'YS' (year start) replaces the equivalent 'AS' alias, deprecated in pandas 2.2.
g = data.groupby([pd.Grouper(freq='YS'), 'dataset'])
data = g['days_solitary'].sum().unstack()  # rows: year, columns: dataset
data.index = data.index.year  # label bars with the year, not a full timestamp

fig, ax = plt.subplots()
data.plot(kind='bar', ax=ax, figsize=(10,6))

# Grey panel with white horizontal grid lines drawn behind the bars.
ax.set_facecolor('#DDDDDD')
ax.set_axisbelow(True)
ax.yaxis.grid(color='#FFFFFF')

plt.xticks(rotation=0)
plt.title('Sum of `days_solitary` for all stays', fontsize=18, pad=10)
plt.ylabel('`days_solitary`', fontsize=12)
plt.xlabel('placement_date', fontsize=12)

# Source/credit note, right-aligned just below the figure.
txt='Data Source: NWDC segregation datasets released via FOIA\nFigure: UW Center for Human Rights'
plt.figtext(1, -.03, txt, wrap=True, horizontalalignment='right', fontsize=12)
plt.show();
Next section: Data Appendix 3. ICE SRMS Data for NWDC, 2nd Installment Aug. 2020
[1] US DOJ attorneys for ICE specified that the terms "Special Management Unit" and "Restricted Housing Unit" are interchangeable and identify the same locations.