#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# DISTRIBUTION STATEMENT A: Approved for public release. Distribution is
# unlimited.
# ----------------------------------------------------------------------------
"""Load supported data files."""
import datetime as dt
import glob
import numpy as np
import os
import pandas as pd
import cdflib
from netCDF4 import Dataset
from pyValEIA import logger
from pyValEIA.eia.types import clean_type
from pyValEIA.io.download import download_and_unzip_swarm
from pyValEIA.io.write import build_daily_stats_filename
from pyValEIA.stats import skill_score
from pyValEIA.utils import coords
[docs]
def load_cdf_data(file_path, variable_names):
"""Load a CDF file.
Parameters
----------
file_path : str
file path
variable_names : array-like
variable names for file to be extracted
Returns
-------
var_dict : dict
CDF file data with desired variables extracted
cdf_data : cdflib.cdfread.CDF
Loaded CDF data from `cdflib`
See Also
--------
cdflib.CDF
"""
# Load the desired file
cdf_data = cdflib.CDF(file_path)
# Extract the desired variable names
var_dict = dict()
for var in variable_names:
try:
var_dict[var] = cdf_data.varget(var)
except ValueError:
logger.warning('Unknown variable {:} in {:}'.format(var, file_path))
return var_dict, cdf_data
[docs]
def load_swarm(start_date, end_date, sat_id, file_dir, instrument='EFI',
dataset='LP', f_end='0602'):
"""Load Swarm data, downloading any missing files.
Parameters
----------
start_date : dt.datetime
Starting time
end_date : dt.datetime
Ending time
sat_id : str
Swarm satellite ID, one of 'A', 'B', or 'C'
file_dir : str
File directory where the instrument directory is located. Files will be
located in a directory tree specified by `download_and_unzip_swarm`
instrument : str
Swarm instrument (default='EFI')
dataset : str
Desired dataset acronym from instrument, e.g. 'LP' is Langmuir Probe
(default='LP')
f_end : str
For different data products there are different numbers at the end
The most common for EFIxLP is '0602' where '0602' represents
the file version. Other datasets may also have a string that represents
the record type (default='0602')
Returns
-------
swarm_data : pd.DataFrame
DataFrame of Swarm data for the desired instrument and satellite
Raises
------
ValueError
If an unknown dataset is requested (currently only supports 'LP')
"""
# Test the input after assigning variables where first variable is the
# time stamp, the second variable is geodetic latitude, and the third
# variable is geographic longitude
variables = {'LP': ["Timestamp", "Latitude", "Longitude", "Ne", "Ne_error",
"Te", "Te_error", "Flags_Ne", "Flags_Te", "Flags_LP",
"Radius"]}
if dataset not in variables.keys():
raise ValueError('unknown Swarm dataset.')
time_var = variables[dataset][0]
lat_var = variables[dataset][1]
lon_var = variables[dataset][2]
# Set variables to be renamed
rename = {'LP': {'Flags_Ne': 'Ne_flag', 'Flags_Te': 'Te_flag',
'Flags_LP': 'LP_flag', 'Radius': 'Altitude'}}
# Initalize the output
swarm_data = pd.DataFrame()
# Set the base directory
base_path = os.path.join(file_dir, instrument,
'Sat_{:s}'.format(sat_id.upper()))
# Cycle through the requested times
file_date = dt.datetime(start_date.year, start_date.month, start_date.day)
while file_date < end_date:
search_pattern = os.path.join(base_path, file_date.strftime('%Y'),
file_date.strftime('%Y%m%d'), "*.cdf")
# Find the desired file
filename = glob.glob(search_pattern)
if len(filename) > 0:
if len(filename) > 1:
logger.warning(''.join([
'found multiple Swarm', sat_id, ' ', instrument,
' files on ', file_date.strftime('%d-%b-%Y'),
' disgarding: {:}'.format(filename[1:])]))
filename = filename[0] # There should only be one file per day
else:
# Download the missing file
download_and_unzip_swarm(file_date, sat_id, file_dir,
instrument=instrument, dataset=dataset,
f_end=f_end)
# Get the downloaded file
filename = glob.glob(search_pattern)
if len(filename) > 0:
filename = filename[0]
else:
logger.warning(''.join(['unable to obtain Swarm', sat_id,
' ', instrument, ' file on ',
file_date.strftime('%d-%b-%Y')]))
filename = ''
# Load data if the file exists
if os.path.isfile(filename):
# Load all the available, desired data but the time
data, cdf_data = load_cdf_data(filename, variables[dataset][1:])
# Load the time as an array of datetime objects
data['Time'] = extract_cdf_time(cdf_data, time_var=time_var)
# Get the additional coordinates
data['Mag_Lat'], data['Mag_Lon'] = coords.compute_magnetic_coords(
data[lat_var], data[lon_var], data['Time'][0])
data['LT'] = coords.longitude_to_local_time(data[lon_var],
data['Time'])
# Convert altitude to km from meters
data['Radius'] -= coords.earth_radius(data[lat_var])
data['Radius'] = data['Radius'] / 1000
if swarm_data.empty:
swarm_data = pd.DataFrame(data)
else:
swarm_data = pd.concat([swarm_data, pd.DataFrame(data)])
# Cycle to the next day
file_date += dt.timedelta(days=1)
# Trim the DataFrame to the desired time range
if not swarm_data.empty:
swarm_data = swarm_data[(swarm_data['Time'] >= start_date)
& (swarm_data['Time'] < end_date)]
if dataset in rename.keys():
swarm_data = swarm_data.rename(columns=rename[dataset])
return swarm_data
[docs]
def load_madrigal(stime, fdir):
"""Load Madrigal TEC data from given time.
Parameters
----------
stime: datetime object
Universal time for the desired madrigal output
fdir : str kwarg
directory where file is located
Returns
-------
mad_dict : dict
Dictionary of the madrigal data including: TEC, geographic latitude,
geographic longitude, TEC error (dTEC), timestamp, and date in the
datetime format
Raises
------
ValueError
If no file was found for the desired date
Notes
-----
This takes in madrgial files of format gps%y%m%dg.002.netCDF4
5 minute cadence
"""
# If Time input is not at midnight, convert it
sday = stime.replace(hour=0, minute=0, second=0, microsecond=0)
search_pattern = os.path.join(fdir, 'gps{:s}g.00*.netCDF4'.format(
sday.strftime("%y%m%d")))
if len(glob.glob(search_pattern)) > 0:
fname = glob.glob(search_pattern)[0]
else:
raise ValueError('No Madrigal File Found for {:}'.format(sday))
# Load the data file
file_id = Dataset(fname)
# Extract the file data
mad_tec = file_id.variables['tec'][:]
mad_gdlat = file_id.variables['gdlat'][:]
mad_glon = file_id.variables['glon'][:]
mad_dtec = file_id.variables['dtec'][:]
mad_time = file_id.variables['timestamps'][:] # every 5 minutes
mad_date_list = np.array([sday + dt.timedelta(minutes=x * 5)
for x in range(288)])
# Format data into a dict
mad_dict = {'time': mad_date_list, 'timestamp': mad_time, 'glon': mad_glon,
'glat': mad_gdlat, 'tec': mad_tec, 'dtec': mad_dtec}
return mad_dict
[docs]
def load_nimo(stime, file_dir, name_format='NIMO_AQ_%Y%j', ne_var='dene',
lon_var='lon', lat_var='lat', alt_var='alt', hr_var='hour',
min_var='minute', tec_var='tec', hmf2_var='hmf2', nmf2_var='nmf2',
time_cadence=15):
"""Load daily NIMO model files.
Parameters
----------
stime : dt.datetime
Day of desired NIMO run
file_dir : str
File directory, wildcards will be resolved but should only result
in one file per day for the specified `name_format`
name_format : str
Format of NIMO file name including date format before .nc
(default='NIMO_AQ_%Y%j')
ne_var : str
Electron density variable name (default='dene')
lon_var : str
Geographic longitude variable name (default='lon')
lat_var : str
Geodetic latitude variable name (default='lat')
alt_var : str
Altitude variable name (default='alt')
hr_var : str
UT hour variable name (default='hour')
min_var : str
UT minute variable name, or '' if not present (default='minute')
tec_var : str
TEC variable name (default='tec')
hmf2_var : str
hmF2 variable name (default='hmf2')
nmf2_var : str
NmF2 variable name (default='nmf2')
time_cadence : int
Model UT output time cadence of data in minutes (default=15)
Returns
-------
nimo_dc : dict
Dictionary with variables dene, glon, glat, alt, hour, minute, date,
tec, nmf2, and hmf2
Raises
------
ValueError
If no NIMO file could be found at the specified location and time
KeyError
If an unexpected variable is supplied
"""
# Use the time to format the file name
name_str = "{:s}.nc".format(stime.strftime(name_format))
# Construct the file path and use glob to resolve any fill values
fname = os.path.join(file_dir, name_str)
fil = glob.glob(fname)
# Ensure only one file was returned, warn user if not
if len(fil) > 0:
nimo_id = Dataset(fil[0])
if len(fil) > 1:
logger.warning(''.join(['multiple NIMO file identified for ',
stime.strftime('%d-%b-%Y'), ', disgarding ',
'{:}'.format(fil[1:])]))
else:
raise ValueError('No NIMO file found for {:} at {:}'.format(
stime, fname))
# Test the input variable keys
for var in [ne_var, tec_var, hmf2_var, hmf2_var, lon_var, lat_var, alt_var,
hr_var]:
if var not in nimo_id.variables.keys():
raise KeyError('Bad input variable {:} not in {:}'.format(
repr(var), nimo_id.variables.keys()))
# Retrieve the desired density variables
nimo_ne = nimo_id.variables[ne_var][:]
nimo_tec = nimo_id.variables[tec_var][:]
nimo_hmf2 = nimo_id.variables[hmf2_var][:]
nimo_nmf2 = nimo_id.variables[nmf2_var][:]
# Get the desired location variables and test for both hemispheres
nimo_lon = nimo_id.variables[lon_var][:]
nimo_lat = nimo_id.variables[lat_var][:]
nimo_alt = nimo_id.variables[alt_var][:]
if np.sign(min(nimo_lat)) != -1:
logger.warning("No Southern latitudes")
elif np.sign(max(nimo_lat)) != 1:
logger.warning("No Northern latitudes")
# Retrieve the desired time variables
nimo_hr = nimo_id.variables[hr_var][:]
if min_var in nimo_id.variables.keys():
nimo_mins = nimo_id.variables[min_var][:]
else:
logger.info('No minute variable, Treating hour as fractional hours')
nimo_mins = np.array([(h % 1) * 60 for h in nimo_hr]).astype(int)
nimo_hr = np.array([int(h) for h in nimo_hr])
# Format the time
sday = stime.replace(hour=nimo_hr[0], minute=nimo_mins[0],
second=0, microsecond=0)
nimo_date_list = np.array([sday + dt.timedelta(minutes=(x - 1)
* time_cadence)
for x in range(len(nimo_ne))])
# Format the output dictionary
nimo_dc = {'time': nimo_date_list, 'dene': nimo_ne, 'glon': nimo_lon,
'glat': nimo_lat, 'alt': nimo_alt, 'hour': nimo_hr,
'minute': nimo_mins, 'tec': nimo_tec, 'hmf2': nimo_hmf2,
'nmf2': nimo_nmf2}
return nimo_dc
[docs]
def load_daily_stats(stime, model, obs, file_dir, **kwargs):
"""Load the daily statistics file with model-data comparisons.
Parameters
----------
stime : datetime
day of desired file
model : str
Case-sensitive name of model requested (e.g., 'NIMO', 'PyIRI').
obs : str
Name of data set requested (e.g., 'SWARM', 'MADRIGAL')
file_dir : str
File directory
kwargs : dict
Optional kwargs by data type. Includes 'mad_lon', which expects
longitudes of either -90 deg E or 60 deg E for Madrigal data.
Returns
-------
stat_data : pd.DataFrame
Dataframe that includes all information from type file
Raises
------
ValueError
If expected file does not exist
"""
# Build the year and date strings
date_dir, fname = build_daily_stats_filename(stime, model, obs, file_dir,
**kwargs)
# Combine the directory and filename
fname = os.path.join(date_dir, fname)
# Test that the file exists
if not os.path.isfile(fname):
raise ValueError('Could not file file: {:s}'.format(fname))
# Open the file
dat = np.genfromtxt(fname, delimiter=None, dtype=None, skip_header=0,
names=True, encoding='utf-8',
missing_values='NaN', filling_values=np.nan)
# If an entire column is np.nan, then genfromtxt cannot interpret the
# dtype, so it assigns it to True. The following code replaces True with
# The original np.nan values
for name in dat.dtype.names:
col = dat[name]
if (col.dtype == bool):
nan_col = np.full(col.shape, np.nan, dtype='float64')
dat = dat.astype([(n, 'float64',) if n == name else
(n, dat.dtype[n]) for n in dat.dtype.names])
dat[name] = nan_col
# Save as a DataFrame
stat_data = pd.DataFrame(dat, columns=dat.dtype.names)
return stat_data
[docs]
def multiday_states_report(date_range, daily_dir, model_name, obs_name,
obs_constraint, comp_type='eia'):
"""Create a state report for a range of dates with a skill check.
Parameters
----------
date_range : pd.DateRange
Date range of desired states files
daily_dir : str
Directory containing the daily files
model_name : str
Case-sensitive model name to load, expects the capitalization used in
the daily stats file header. (e.g., 'Nimo', 'PyIRI')
obs_name : str
Observation name to load (e.g., 'SWARM', 'MADRIGAL')
obs_constraint : str or int
Additional constraint for observation type. For Madrigal, this is the
longitude as a integer. For Swarm, this is the altitude string, which
currently accepts 'swarm', 'hmf2', or '100'.
comp_type : str
Desired type to check against for orientation or EIA state, expecting
one of 'eia', 'peak', 'flat', or 'trough' for EIA state comparison or
one of 'north', 'south', or 'neither' for an orientation comparison
(default='eia')
Returns
-------
model_frame : pd.DataFrame
Model longitude, local time, states, directions, and EIA types. If
Swarm observations are requested, will also contain a satellite list.
obs_frame : pd.DataFrame
Data longitude, local time, states, directions, and EIA types. If
Swarm observations are requested, will also contain a satellite list.
Raises
------
ValueError
For incompatible input combinations
See Also
--------
load_daily_stats
For accepted models, observations, and Madrigal longitudes
"""
# Check to see what we are comparing (states or directions)
if comp_type.lower() in ['north', 'south', 'neither']:
orientation = 'direction'
else:
orientation = 'state'
# Initialize the parameter dicts
mod_dict = {'state': list(), 'direction': list(), 'type': list(),
'Glon': list(), 'LT': list()}
obs_dict = {'state': list(), 'direction': list(), 'type': list(),
'Glon': list(), 'LT': list()}
# Determine the observation-specific inputs
if obs_name.lower() == 'swarm':
load_kwargs = {}
obs_str = 'Swarm_EIA_Type'
mod_dict['Sat'] = list()
obs_dict['Sat'] = list()
if model_name.lower() == 'pyiri':
model_str = 'PyIRI_Type'
else:
if obs_constraint == 'swarm':
model_str = '{:s}_Swarm_Type'.format(model_name)
elif obs_constraint == 'hmf2':
model_str = '{:s}_hmf2_Type'.format(model_name)
elif obs_constraint == '100':
model_str = '{:s}_Swarm100_Type'.format(model_name)
else:
raise ValueError(''.join([
'Unknown altitude type provided in `obs_constraint`: ',
repr(obs_constraint)]))
elif obs_name.lower() == 'madrigal':
obs_str = 'Mad_EIA_Type'
load_kwargs = {'mad_lon': obs_constraint}
if model_name.lower() == 'pyiri':
raise ValueError('Have not yet implemented TEC retrieval in PyIRI')
else:
model_str = '{:s}_Type'.format(model_name)
else:
raise ValueError('Unknown observation source')
# Cycle through each day
for sday in date_range.to_pydatetime():
# Load the desired file
daily_stats = load_daily_stats(sday, model_name.upper(),
obs_name.upper(), daily_dir,
**load_kwargs)
# Clean and save the model data
clean_out = clean_type(daily_stats[model_str].values)
mod_dict['state'].extend(clean_out[0])
mod_dict['direction'].extend(clean_out[1])
mod_dict['type'].extend(daily_stats[model_str].values)
# Clean and save the observed data
clean_out = clean_type(daily_stats[obs_str].values)
obs_dict['state'].extend(clean_out[0])
obs_dict['directon'].extend(clean_out[1])
obs_dict['type'].extend(daily_stats[obs_str].values)
# Save the location information
mod_dict['Glon'].extend(
daily_stats['{:s}_Glon'.format(model_name)].values)
obs_dict['Glon'].extend(
daily_stats['{:s}_Glon'.format(model_name)].values)
mod_dict['Glon'].extend(daily_stats['LT_Hour'].values)
obs_dict['Glon'].extend(daily_stats['LT_Hour'].values)
if 'Sat' in mod_dict.keys():
mod_dict['Sat'].extend(daily_stats['Satellite'].values)
obs_dict['Sat'].extend(daily_stats['Satellite'].values)
# Cast the dictionaries as pandas DataFrames
model_frame = pd.DataFrame(mod_dict)
obs_frame = pd.DataFrame(obs_dict)
# Get the skill score of the model against the observations
model_frame['skill'] = skill_score.state_check(
obs_frame[orientation].values, model_frame[orientation].values,
state=comp_type)
return model_frame, obs_frame