"""
The LiPD class describes a `LiPD (Linked Paleo Data) <https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016.html>`_ object. It contains an `RDF <https://www.w3.org/RDF/>`_ Graph which is serialization of the LiPD data into an RDF graph containing terms from the `LiPD Ontology <http://linked.earth/Ontology/release/core/1.2.0/index-en.html>`
How to browse and query LiPD objects is described in a short example below, while `this notebook <https://nbviewer.jupyter.org/github/LinkedEarth/pylipd/blob/master/example_notebooks/pylipd_tutorial.ipynb>`_ demonstrates how to use PyLiPD to view and query LiPD datasets.
"""
import os
import re
import os.path
import tempfile
import pandas as pd
import random
import string
import io
import numpy as np
import ast
from rdflib import ConjunctiveGraph, URIRef
from tqdm import tqdm
from .globals.queries import QUERY_ALL_VARIABLES_GRAPH, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
from .lipd_series import LiPDSeries
from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd
from .utils.rdf_graph import RDFGraph
from .utils.rdf_to_lipd import RDFToLiPD
from .utils.legacy_utils import LiPD_Legacy
from .utils.utils import sanitizeId
#import bibtexparser
#from bibtexparser.bibdatabase import BibDatabase
from doi2bib import crossref
from pybtex.database import BibliographyData, Entry
from .globals.urls import NSURL
[docs]class LiPD(RDFGraph):
'''The LiPD class describes a `LiPD (Linked Paleo Data) <https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016.html>`_ object. It contains an `RDF <https://www.w3.org/RDF/>`_ Graph which is serialization of the LiPD data into an RDF graph containing terms from the `LiPD Ontology <http://linked.earth/Ontology/release/core/1.2.0/index-en.html>`
How to browse and query LiPD objects is described in a short example below.
Examples
--------
In this example, we read an online LiPD file and convert it into a time series object dictionary.
.. jupyter-execute::
from pylipd.lipd import LiPD
lipd = LiPD()
lipd.load(["https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1//Nunalleq.Ledger.2018.lpd"])
ts_list = lipd.get_timeseries(lipd.get_all_dataset_names())
for dsname, tsos in ts_list.items():
for tso in tsos:
if 'paleoData_variableName' in tso:
print(dsname+': '+tso['paleoData_variableName']+': '+tso['archiveType'])
'''
def __init__(self, graph=None):
super().__init__(graph)
[docs] def load_from_dir(self, dir_path, parallel=False, cutoff=None):
'''Load LiPD files from a directory
Parameters
----------
dir_path : str
path to the directory containing lipd files
parallel: bool
(Optional) set to True to process lipd files in parallel. You *must* run this function under the "__main__" process for this to work
cutoff : int
(Optional) the maximum number of files to load at once.
Examples
--------
In this example, we load LiPD files from a directory.
.. jupyter-execute::
from pylipd.lipd import LiPD
lipd = LiPD()
lipd.load_from_dir("../examples/data")
print(lipd.get_all_dataset_names())
'''
if not os.path.isdir(dir_path):
print(f"Directory {dir_path} does not exist")
return
lipdfiles = []
for path in os.listdir(dir_path):
file_path = os.path.join(dir_path, path)
if os.path.isfile(file_path) and path.endswith(".lpd"):
lipdfiles.append(file_path)
if cutoff:
lipdfiles = lipdfiles[0:cutoff]
self.load(lipdfiles, parallel)
# Allows loading http locations
[docs] def load(self, lipdfiles, parallel=False):
'''Load LiPD files.
Parameters
----------
lipdfiles : list of str
array of paths to lipd files (the paths could also be urls)
parallel: bool
(Optional) set to True to process lipd files in parallel. You *must* run this function under the "__main__" process for this to work
Examples
--------
In this example, we load LiPD files for an array of paths.
.. jupyter-execute::
from pylipd.lipd import LiPD
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
"../examples/data/MD98_2181.Stott.2007.lpd",
"../examples/data/Ant-WAIS-Divide.Severinghaus.2012.lpd",
"https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1/Nunalleq.Ledger.2018.lpd"
])
print(lipd.get_all_dataset_names())
'''
if type(lipdfiles) is not list:
lipdfiles = [lipdfiles]
numfiles = len(lipdfiles)
print(f"Loading {numfiles} LiPD files")
self.graph = multi_load_lipd(self.graph, lipdfiles, parallel)
print("Loaded..")
#def load_from_lipdverse(self, datasetID, version=None):
[docs] def convert_lipd_dir_to_rdf(self, lipd_dir, rdf_file, parallel=False):
'''Convert a directory containing LiPD files into a single RDF file (to be used for uploading to Knowledge Bases like GraphDB)
Parameters
----------
lipd_dir : str
Path to the directory containing lipd files
rdf_file : str
Path to the output rdf file
'''
filemap = {}
for path in os.listdir(lipd_dir):
fullpath = os.path.join(lipd_dir, path)
tmp_rdf_file = tempfile.NamedTemporaryFile().name
filemap[fullpath] = tmp_rdf_file
print(f"Converting {len(filemap.keys())} LiPD files to RDF..")
multi_convert_to_rdf(filemap, parallel)
print("Conversion to RDF done..")
print("Writing to main RDF file..")
with open(rdf_file, "w") as fout:
for lipdfile in filemap.keys():
tmp_rdf_file = filemap[lipdfile]
if os.path.exists(tmp_rdf_file):
fin = open(tmp_rdf_file, "r")
data = fin.read();
fin.close()
fout.write(data)
os.remove(tmp_rdf_file)
fout.close()
print("Written..")
[docs] def load_remote_datasets(self, dsnames):
'''Loads remote datasets into cache if a remote endpoint is set
Parameters
----------
dsnames : array
array of dataset names
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Fetch LiPD data from remote RDF Graph
lipd_remote = LiPD()
lipd_remote.set_endpoint("https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse2")
lipd_remote.load_remote_datasets(["Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001", "MD98_2181.Stott.2007", "Ant-WAIS-Divide.Severinghaus.2012"])
print(lipd_remote.get_all_dataset_names())
'''
if not self.endpoint:
raise Exception("No remote endpoint")
if type(dsnames) is not list:
dsnames = [dsnames]
if dsnames == None or len(dsnames) == 0:
raise Exception("No dataset names to cache")
dsnamestr = (' '.join('<' + NSURL + "/" + dsname + '>' for dsname in dsnames))
print("Caching datasets from remote endpoint..")
qres, qres_df = self.query(f"SELECT ?s ?p ?o ?g WHERE {{ GRAPH ?g {{ ?s ?p ?o }} VALUES ?g {{ {dsnamestr} }} }}", remote=True)
# Reinitialize graph
# self._initialize_graph()
for row in qres:
self.graph.add((row.s, row.p, row.o, row.g))
print("Done..")
[docs] def update_remote_datasets(self, dsnames):
'''Updates local LiPD Graph for datasets to remote endpoint'''
if not self.endpoint:
raise Exception("No remote endpoint")
# TODO: Implement this
[docs] def get_bibtex(self, remote = True, save = True, path = 'mybiblio.bib', verbose = False):
'''Get BibTeX for loaded datasets
Parameters
----------
remote : bool
(Optional) If set to True, will return the bibliography by checking against the DOI
save : bool
(Optional) Whether to save the bibliography to a file
path : str
(Optional) Path where to save the file
verbose : bool
(Optional) Whether to print out on the console. Note that this option will turn on automatically if saving to a file fails.
Returns
-------
bibs : list
List of BiBTex entry
df : pandas.DataFrame
Bibliography information in a Pandas DataFrame
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Fetch LiPD data from remote RDF Graph
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
"../examples/data/MD98_2181.Stott.2007.lpd"
])
print(lipd.get_bibtex(save=False))
'''
def establish_type(pub_type):
if pub_type:
pub_type = re.sub('-', '', pub_type).lower()
else:
pub_type = 'misc'
if re.match(r".*article.*", pub_type) or re.match(r".*shortcommunication.*", pub_type):
pub_type = 'article'
elif re.match(r".*chapter.*", pub_type) or re.match(r".*book.*", pub_type):
pub_type = 'chapter'
elif re.match(r".*report.*", pub_type):
pub_type = 'report'
else:
pub_type = 'misc'
return pub_type
def make_bib(row):
pub_type = establish_type(row['type'])
# Create a unique citation ID if not given
row = row.fillna("")
if row['citeKey'] is None:
characters = string.ascii_letters + string.digits
citation_key = ''.join(random.choice(characters) for i in range(8))
else:
citation_key = row['citeKey']
entries = [] #start creating the list
if row['authors']:
entries.append(('author', str(row['authors'])))
if row['doi']:
entries.append(('doi',str(row['doi'])))
if row['year']:
entries.append(('year',str(row['year'])))
if row['pubyear']:
entries.append(('year',str(row['pubyear'])))
if row['title']:
if pub_type == 'article' or pub_type == 'misc':
entries.append(('title',str(row['title'])))
elif pub_type == 'chapter' or pub_type == 'report':
entries.append(('chapter', str(row['title'])))
if row['journal']:
if pub_type == 'article':
entries.append(('journal', str(row['journal'])))
if pub_type == 'book':
entries.append(('title', str(row['journal'])))
if row['volume']:
entries.append(('volume', str(row['volume'])))
if row['issue']:
entries.append(('issue', str(row['issue'])))
if row['pages']:
entries.append(('pages', str(row['pages'])))
if row['publisher']:
entries.append(('publisher', str(row['publisher'])))
if row['report']:
entries.append(('title', str(row['report'])))
if row['edition']:
entries.append(('edition', str(row['edition'])))
if row['institution']:
entries.append(('institution',str(row['institution'])))
if row['url']:
entries.append(('url',str(row['url'])))
if row['url2']:
entries.append(('url',str(row['url2'])))
if pub_type == 'article':
bib = BibliographyData({citation_key:Entry('article',entries)})
elif pub_type == 'chapter' or pub_type == 'report':
bib = BibliographyData({citation_key:Entry('inbook',entries)})
elif pub_type == 'misc':
bib = BibliographyData({citation_key:Entry('misc',entries)})
return bib
result, df = self.query(QUERY_BIBLIO)
bibs = []
for idx,row in df.iterrows():
if remote == True:
try:
f = (crossref.get_bib(row['doi']))
if f[0]==True:
bibs.append(f[1])
else:
print("Cannot find a matching record for the provided DOI, creating the entry manually")
bibs.append(make_bib(row).to_string('bibtex'))
except:
print("Cannot parse the provided DOI, creating the entry manually")
bibs.append(make_bib(row).to_string('bibtex'))
if save == True:
try:
with io.open(path, 'w', encoding="utf-8") as bibfile:
for bib in bibs:
bibfile.write("{}\n".format(bib))
except TypeError:
print("Can't save in output file\n")
verbose = True
if verbose == True:
print(bibs)
return bibs, df
[docs] def get_timeseries(self, dsnames, to_dataframe=False):
'''Get Legacy LiPD like Time Series Object (tso)
Parameters
----------
dsnames : list
array of dataset id or name strings
to_dataframe : bool {True; False}
Whether to return a dataframe along the dictionary. Default is False
Returns
-------
ts : dict
A dictionary containing Time Series Object
df : Pandas.DataFrame
If to_dataframe is set to True, returns a queriable Pandas DataFrame
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Fetch LiPD data from remote RDF Graph
lipd_remote = LiPD()
lipd_remote.set_endpoint("https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse2")
ts_list = lipd_remote.get_timeseries(["Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001", "MD98_2181.Stott.2007", "Ant-WAIS-Divide.Severinghaus.2012"])
for dsname, tsos in ts_list.items():
for tso in tsos:
if 'paleoData_variableName' in tso:
print(dsname+': '+tso['paleoData_variableName']+': '+tso['archiveType'])
'''
if type(dsnames)==str:
dsnames=[dsnames]
ts = self._get_timeseries(dsnames)
if to_dataframe == False:
return ts
elif to_dataframe == True:
dict_list =[]
for item in ts.keys():
for dictionary in ts[item]:
dict_list.append(dictionary)
df = pd.DataFrame.from_dict(dict_list, orient='columns')
return ts, df
def _get_timeseries(self, dsnames):
timeseries = {}
for dsname in dsnames:
converter = RDFToLiPD(self.graph)
d = converter.convert_to_json(dsname)
print("Extracting timeseries from dataset: " + dsname + " ...")
if len(d.items()):
tss = LiPD_Legacy().extract(d)
timeseries[dsname] = tss
return timeseries
[docs] def get_timeseries_essentials(self, dsname = None, mode='paleo'):
''' Returns specific properties for timeseries: 'dataSetName', 'archiveType', 'geo_meanLat', 'geo_meanLon',
'geo_meanElev', 'paleoData_variableName', 'paleoData_values',
'paleoData_units', 'paleoData_proxy' (paleo only), 'paleoData_proxyGeneral' (paleo only),
'time_variableName', 'time_values', 'time_units', 'depth_variableName',
'depth_values', 'depth_units'
Parameters
----------
dsname : str, optional
The name of the dataset for which to return the timeseries information. The default is None.
mode : paleo, chron
Whether to retrun the information stored in the PaleoMeasurementTable or the ChronMeasurementTable. The default is 'paleo'.
Raises
------
ValueError
Need to select either 'chron' or 'paleo'
Returns
-------
qres_df : pandas.DataFrame
A pandas dataframe returning the properties in columns for each series stored in a row of the dataframe
Example
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_datasets
lipd = load_datasets('ODP846.Lawrence.2006.lpd')
df_paleo = lipd.get_timeseries_essentials(mode='paleo')
print(df_paleo)
To return the information stored in the ChronTable:
.. jupyter-execute::
from pylipd.utils.dataset import load_datasets
lipd = load_datasets('ODP846.Lawrence.2006.lpd')
df_chron = lipd.get_timeseries_essentials(mode='chron')
print(df_chron)
'''
if dsname is None:
dsname= ''
if mode == 'paleo':
query = QUERY_TIMESERIES_ESSENTIALS_PALEO
query = query.replace("[dsname]", dsname)
elif mode == 'chron':
query = QUERY_TIMESERIES_ESSENTIALS_CHRON
query = query.replace("[dsname]", dsname)
else:
raise ValueError("The mode should be either 'paleo' or 'chron'")
qres, qres_df = self.query(query)
try:
qres_df['paleoData_values']=qres_df['paleoData_values'].apply(lambda row : np.fromstring(row.strip("[]"), sep=','))
except:
qres_df['chronData_values']=qres_df['chronData_values'].apply(lambda row : np.fromstring(row.strip("[]"), sep=','))
qres_df['time_values']=qres_df['time_values'].apply(lambda x : np.fromstring(x.strip("[]"), sep=',') if x is not None else None)
qres_df['depth_values']=qres_df['depth_values'].apply(lambda x : np.fromstring(x.strip("[]"), sep=',') if x is not None else None)
return qres_df
[docs] def get_lipd(self, dsname):
'''Get LiPD json for a dataset
Parameters
----------
dsname : str
dataset id
Returns
-------
lipdjson : dict
LiPD json
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Load a local LiPD file
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
])
lipd_json = lipd.get_lipd(lipd.get_all_dataset_names()[0])
print(lipd_json)
'''
converter = RDFToLiPD(self.graph)
return converter.convert_to_json(dsname)
[docs] def create_lipd(self, dsname, lipdfile):
'''Create LiPD file for a dataset
Parameters
----------
dsname : str
dataset id
lipdfile: str
path to LiPD file
Returns
-------
lipdjson : dict
LiPD json
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Load a local file
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
])
dsname = lipd.get_all_dataset_names()[0]
lipd.create_lipd(dsname, "test.lpd")
'''
converter = RDFToLiPD(self.graph)
return converter.convert(dsname, lipdfile)
[docs] def get(self, dsnames):
'''Gets dataset(s) from the graph and returns the popped LiPD object
Parameters
----------
dsnames : str or list of str
dataset name(s) to get.
Returns
-------
pylipd.lipd.LiPD
LiPD object with the retrieved dataset(s)
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Load LiPD files from a local directory
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
"../examples/data/MD98_2181.Stott.2007.lpd"
])
all_datasets = lipd.get_all_dataset_names()
print("Loaded datasets: " + str(all_datasets))
ds = lipd.get(all_datasets[0])
print("Got dataset: " + str(ds.get_all_dataset_names()))
'''
dsnames = [dsnames] if type(dsnames) is not list else dsnames
dsids = [(f"{NSURL}/{dsname}" if not dsname.startswith(NSURL) else dsname) for dsname in dsnames]
ds = super().get(dsids)
return LiPD(ds.graph)
[docs] def pop(self, dsnames):
'''Pops dataset(s) from the graph and returns the popped LiPD object
Parameters
----------
dsnames : str or list of str
dataset name(s) to be popped.
Returns
-------
pylipd.lipd.LiPD
LiPD object with the popped dataset(s)
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Load local files
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
"../examples/data/MD98_2181.Stott.2007.lpd"
])
all_datasets = lipd.get_all_dataset_names()
print("Loaded datasets: " + str(all_datasets))
popped = lipd.pop(all_datasets[0])
print("Loaded datasets after pop: " + str(lipd.get_all_dataset_names()))
print("Popped dataset: " + str(popped.get_all_dataset_names()))
'''
dsnames = [dsnames] if type(dsnames) is not list else dsnames
dsids = [(f"{NSURL}/{dsname}" if not dsname.startswith(NSURL) else dsname) for dsname in dsnames]
popped = super().pop(dsids)
return LiPD(popped.graph)
[docs] def remove(self, dsnames):
'''Removes dataset(s) from the graph
Parameters
----------
dsnames : str or list of str
dataset name(s) to be removed
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Load local files
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
"../examples/data/MD98_2181.Stott.2007.lpd"
])
all_datasets = lipd.get_all_dataset_names()
print("Loaded datasets: " + str(all_datasets))
lipd.remove(all_datasets[0])
print("Loaded datasets after remove: " + str(lipd.get_all_dataset_names()))
'''
dsnames = [dsnames] if type(dsnames) is not list else dsnames
dsids = [(f"{NSURL}/{dsname}" if not dsname.startswith(NSURL) else dsname) for dsname in dsnames]
super().remove(dsids)
[docs] def get_all_dataset_names(self):
'''Get all Dataset Names
Returns
-------
dsnames : list
A list of datasetnames
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Load local files
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
"../examples/data/MD98_2181.Stott.2007.lpd"
])
print(lipd.get_all_dataset_names())
'''
qres, qres_df = self.query(QUERY_DSNAME)
return [sanitizeId(row.dsname) for row in qres]
[docs] def get_all_dataset_ids(self):
'''Get all Dataset ids
Returns
-------
dsids : list
A list of datasetnames
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Load local files
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
"../examples/data/MD98_2181.Stott.2007.lpd"
])
print(lipd.get_all_dataset_ids())
'''
qres, qres_df = self.query(QUERY_DSID)
return [sanitizeId(row.dsid) for row in qres]
[docs] def get_all_archiveTypes(self):
'''
Returns a list of all the unique archiveTypes present in the LiPD object
Returns
-------
list
A list of archiveTypes
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
# Load Local files
lipd = LiPD()
lipd.load([
"../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
"../examples/data/MD98_2181.Stott.2007.lpd"
])
print(lipd.get_all_archiveTypes())
'''
qres, qres_df = self.query(QUERY_UNIQUE_ARCHIVE_TYPE)
return [str(row.archiveType) for row in qres]
[docs] def get_all_locations(self, dsname = None):
'''Return geographical coordinates for all the datasets.
Parameters
----------
dsname : str, optional
The name of the dataset for which to return the timeseries information. The default is None.
Returns
-------
df : pandas.DataFrame
A pandas dataframe returning the latitude, longitude and elevation for each dataset
Examples
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir('Pages2k')
df = lipd.get_all_locations()
print(df)
'''
if dsname is None:
dsname= ''
query = QUERY_LOCATION
query = query.replace("[dsname]", dsname)
return self.query(query)[1]
[docs] def get_ensemble_tables(self, dsname = None, ensembleVarName = None, ensembleDepthVarName = 'depth'):
'''Gets ensemble tables from the LiPD graph
Parameters
----------
dsname : str
The name of the dataset if you wish to analyse one at a time (Set to ".*" to match all datasets with a common root)
ensembleVarName : None or str
ensemble variable name. Default is None, which searches for names that contain "year" or "age" (Set to ".*" to match all ensemble variable names)
ensembleDepthVarName : str
ensemble depth variable name. Default is 'depth' (Set to ".*" to match all ensemble depth variable names)
Returns
-------
ensemble_tables : dataframe
A dataframe containing the ensemble tables
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
lipd = LiPD()
lipd.load([
"../examples/data/ODP846.Lawrence.2006.lpd"
])
all_datasets = lipd.get_all_dataset_names()
print("Loaded datasets: " + str(all_datasets))
ens_df = lipd.get_ensemble_tables(
ensembleVarName="age",
ensembleDepthVarName="depth"
)
print(ens_df)
'''
if dsname is None:
dsname = ''
if ensembleVarName is None:
query = QUERY_ENSEMBLE_TABLE_SHORT
query = query.replace("[dsname]", dsname)
query = query.replace("[ensembleDepthVarName]", ensembleDepthVarName)
else:
query = QUERY_ENSEMBLE_TABLE
query = query.replace("[dsname]", dsname)
query = query.replace("[ensembleVarName]", ensembleVarName)
query = query.replace("[ensembleDepthVarName]", ensembleDepthVarName)
qres, qres_df = self.query(query)
qres_df['ensembleDepthValues']=qres_df['ensembleDepthValues'].apply(lambda row : np.fromstring(row.strip("[]"), sep=','))
qres_df['ensembleVariableValues']=qres_df['ensembleVariableValues'].apply(lambda row : np.array(ast.literal_eval(row)))
return qres_df
[docs] def get_all_variables(self):
'''
Returns a list of all variables in the graph
Returns
-------
pandas.DataFrame
A dataframe of all variables in the graph with columns uri, varid, varname
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
lipd = LiPD()
lipd.load([
"../examples/data/ODP846.Lawrence.2006.lpd"
])
df = lipd.get_all_variables()
print(df)
'''
return self.query(QUERY_VARIABLE)[1]
[docs] def get_all_variable_names(self):
"""
Get a list of all possible distinct variableNames. Useful for filtering and qeurying.
Returns
-------
list
A list of unique variableName
Examples
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir('Pages2k')
varName = lipd.get_all_variable_names()
print(varName)
"""
return self.query(QUERY_DISTINCT_VARIABLE)[1].iloc[:,0].values.tolist()
[docs] def get_dataset_properties(self):
"""Get a list of unique properties attached to a dataset.
Note: Some properties will return another object (e.g., 'publishedIn' will give you a Publication object with its own properties)
Note: Not all datasets will have the same available properties (i.e., not filled in by a user)
Returns
-------
clean_list : list
A list of avialable properties that can queried
Examples
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir(name='Pages2k')
dataset_properties = lipd.get_dataset_properties()
print(dataset_properties)
"""
query_list = self.query(QUERY_DATASET_PROPERTIES)[1].iloc[:,0].values.tolist()
clean_list = [item.split("#")[-1] for item in query_list]
return clean_list
[docs] def get_variable_properties(self):
'''Get a list of variable properties that can be used for querying
Returns
-------
list
A list of unique variable properties
Examples
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir(name='Pages2k')
variable_properties = lipd.get_variable_properties()
print(variable_properties)
'''
query_list = self.query(QUERY_VARIABLE_PROPERTIES)[1].iloc[:,0].values.tolist()
clean_list = [item.split("#")[-1] for item in query_list]
return clean_list
[docs] def get_model_properties(self):
'''Get all the properties associated with a model
Returns
-------
List
A list of unique properties attached to models
Examples
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_datasets
lipd = load_datasets(names='ODP846')
model_properties = lipd.get_model_properties()
print(model_properties)
'''
query_list = self.query(QUERY_MODEL_PROPERTIES)[1].iloc[:,0].values.tolist()
clean_list = [item.split("#")[-1] for item in query_list]
return clean_list
[docs] def to_lipd_series(self, parallel=False):
'''
Converts the LiPD object to a LiPDSeries object
Parameters
----------
parallel : bool
Whether to use parallel processing to load the data. Default is False
Returns
-------
pylipd.lipd.LiPDSeries
A LiPDSeries object
Examples
--------
.. jupyter-execute::
from pylipd.lipd import LiPD
lipd = LiPD()
lipd.load([
"../examples/data/ODP846.Lawrence.2006.lpd"
])
S = lipd.to_lipd_series()
'''
S = LiPDSeries()
S.load(self, parallel)
return S
# bbox = left,bottom,right,top
# bbox = min Longitude , min Latitude , max Longitude , max Latitude
[docs] def filter_by_geo_bbox(self, lonMin, latMin, lonMax, latMax):
'''
Filters datasets to return a new LiPD object that only keeps datasets that fall within the bounding box
Parameters
----------
lonMin : float
Minimum longitude
latMin : float
Minimum latitude
lonMax : float
Maximum longitude
latMax : float
Maximum latitude
Returns
-------
pylipd.lipd.LiPD
A new LiPD object that only contains datasets that fall within the bounding box
Examples
--------
pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir()
Lfiltered = lipd.filter_by_geo_bbox(0,25,50,50)
Lfiltered.get_all_dataset_names()
'''
query = QUERY_FILTER_GEO
query = query.replace("[lonMin]", str(lonMin))
query = query.replace("[latMin]", str(latMin))
query = query.replace("[lonMax]", str(lonMax))
query = query.replace("[latMax]", str(latMax))
qres, qres_df = self.query(query)
dsnames = [sanitizeId(row.dsname) for row in qres]
return self.get(dsnames)
[docs] def filter_by_archive_type(self, archiveType):
'''
Filters datasets to return a new LiPD object that only keeps datasets that have the specified archive type
Parameters
----------
archiveType : str
The archive type to filter by
Returns
-------
pylipd.lipd.LiPD
A new LiPD object that only contains datasets that have the specified archive type (regex)
Examples
--------
pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir('Pages2k')
Lfiltered = lipd.filter_by_archive_type('marine')
Lfiltered.get_all_dataset_names()
'''
query = QUERY_FILTER_ARCHIVE_TYPE
query = query.replace("[archiveType]", archiveType)
qres, qres_df = self.query(query)
dsnames = [sanitizeId(row.dsname) for row in qres]
return self.get(dsnames)