"""
The LiPD class describes a `LiPD (Linked Paleo Data) <https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016.html>`_ object. It contains an `RDF <https://www.w3.org/RDF/>`_ Graph which is a serialization of the LiPD data into an RDF graph containing terms from the `LiPD Ontology <http://linked.earth/Ontology/release/core/1.2.0/index-en.html>`_.
How to browse and query LiPD objects is described in a short example below, while `this notebook <https://nbviewer.jupyter.org/github/LinkedEarth/pylipd/blob/master/example_notebooks/pylipd_tutorial.ipynb>`_ demonstrates how to use PyLiPD to view and query LiPD datasets.
"""
import ast
import os
import re
import os.path
import tempfile
import pandas as pd
import random
import string
import io
import numpy as np
import json
import uuid
from pylipd.classes.dataset import Dataset
from pylipd.utils.json_to_rdf import JSONToRDF
from pylipd.utils.rdf_to_json import RDFToJSON
from .globals.queries import QUERY_FILTER_TIME, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION, QUERY_FILTER_DATASET_NAME, QUERY_FILTER_COMPILATION, QUERY_COMPILATION_NAME
from .lipd_series import LiPDSeries
from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd
from .utils.rdf_graph import RDFGraph
from .utils.rdf_to_lipd import RDFToLiPD
from .utils.legacy_utils import LiPD_Legacy
from .utils.utils import sanitizeId
#import bibtexparser
#from bibtexparser.bibdatabase import BibDatabase
from doi2bib import crossref
from pybtex.database import BibliographyData, Entry
from .globals.urls import NSURL, DEFAULT_GRAPH_URI
[docs]
class LiPD(RDFGraph):
'''The LiPD class describes a `LiPD (Linked Paleo Data) <https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016.html>`_ object. It contains an `RDF <https://www.w3.org/RDF/>`_ Graph which is a serialization of the LiPD data into an RDF graph containing terms from the `LiPD Ontology <http://linked.earth/Ontology/release/core/1.2.0/index-en.html>`_.
How to browse and query LiPD objects is described in a short example below.
Examples
--------
In this example, we read an online LiPD file and convert it into a time series object dictionary.
.. jupyter-execute::
from pylipd.lipd import LiPD
lipd = LiPD()
lipd.load(["https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1//Nunalleq.Ledger.2018.lpd"])
ts_list = lipd.get_timeseries(lipd.get_all_dataset_names())
for dsname, tsos in ts_list.items():
for tso in tsos:
if 'paleoData_variableName' in tso:
print(dsname+': '+tso['paleoData_variableName']+': '+tso['archiveType'])
'''
def __init__(self, graph=None):
    # Construction is delegated entirely to the RDFGraph base class; the
    # optional ``graph`` argument is forwarded unchanged.
    super().__init__(graph)
[docs]
def load_from_dir(self, dir_path, parallel=False, cutoff=None, standardize=True, add_labels=True):
    '''Load the ``.lpd`` files found directly inside a directory

    Sub-directories are not searched. If ``dir_path`` is not a directory a
    message is printed and nothing is loaded.

    Parameters
    ----------

    dir_path : str
        path to the directory containing lipd files

    parallel : bool
        (Optional) set to True to process lipd files in parallel. You *must* run this function under the "__main__" process for this to work

    cutoff : int
        (Optional) the maximum number of files to load at once.

    standardize : bool
        (Optional) passed through unchanged to `load`

    add_labels : bool
        (Optional) passed through unchanged to `load`

    Examples
    --------
    In this example, we load LiPD files from a directory.

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        lipd = LiPD()
        lipd.load_from_dir("../examples/data")
        print(lipd.get_all_dataset_names())
    '''
    if not os.path.isdir(dir_path):
        print(f"Directory {dir_path} does not exist")
        return

    # Keep regular files whose name ends in ".lpd", in os.listdir order.
    found = [
        os.path.join(dir_path, entry)
        for entry in os.listdir(dir_path)
        if entry.endswith(".lpd") and os.path.isfile(os.path.join(dir_path, entry))
    ]
    if cutoff:
        found = found[:cutoff]
    self.load(found, parallel, standardize, add_labels)
# Allows loading http locations
[docs]
def load(self, lipdfiles, parallel=False, standardize=True, add_labels=True):
    '''Load LiPD files into this object's graph.

    Parameters
    ----------

    lipdfiles : list of str
        array of paths to lipd files (the paths could also be urls).
        A value that is not a list is treated as a single path.

    parallel : bool
        (Optional) set to True to process lipd files in parallel. You *must* run this function under the "__main__" process for this to work

    standardize : bool
        (Optional) passed through unchanged to the loader

    add_labels : bool
        (Optional) passed through unchanged to the loader

    Examples
    --------
    In this example, we load LiPD files for an array of paths.

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            "../examples/data/MD98_2181.Stott.2007.lpd",
            "../examples/data/Ant-WAIS-Divide.Severinghaus.2012.lpd",
            "https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1/Nunalleq.Ledger.2018.lpd"
        ])
        print(lipd.get_all_dataset_names())
    '''
    paths = lipdfiles if type(lipdfiles) is list else [lipdfiles]
    print(f"Loading {len(paths)} LiPD files")
    # The loader returns the graph to use from now on.
    self.graph = multi_load_lipd(self.graph, paths, parallel, standardize, add_labels)
    print("Loaded..")
#def load_from_lipdverse(self, datasetID, version=None):
[docs]
def convert_lipd_dir_to_rdf(self, lipd_dir, rdf_file, parallel=False, standardize=True, add_labels=False):
    '''Convert a directory containing LiPD files into a single RDF file (to be used for uploading to Knowledge Bases like GraphDB)

    Every entry of ``lipd_dir`` is converted to its own temporary RDF file;
    the temporary files that were actually produced are then concatenated
    into ``rdf_file`` and deleted.

    Parameters
    ----------

    lipd_dir : str
        Path to the directory containing lipd files

    rdf_file : str
        Path to the output rdf file

    parallel : bool
        (Optional) passed through unchanged to the converter

    standardize : bool
        (Optional) passed through unchanged to the converter

    add_labels : bool
        (Optional) passed through unchanged to the converter
    '''
    filemap = {}
    for path in os.listdir(lipd_dir):
        fullpath = os.path.join(lipd_dir, path)
        # Only the generated *name* is kept; the temporary file object itself
        # is discarded right away.
        # NOTE(review): this relies on the name staying unused until the
        # converter writes to it -- confirm this is acceptable.
        filemap[fullpath] = tempfile.NamedTemporaryFile().name

    print(f"Converting {len(filemap)} LiPD files to RDF..")
    multi_convert_to_rdf(filemap, parallel, standardize, add_labels)
    print("Conversion to RDF done..")

    print("Writing to main RDF file..")
    with open(rdf_file, "w") as fout:
        for tmp_rdf_file in filemap.values():
            # A missing temporary file is skipped silently.
            if not os.path.exists(tmp_rdf_file):
                continue
            try:
                with open(tmp_rdf_file, "r") as fin:
                    fout.write(fin.read())
            finally:
                # Remove the temporary file even if reading/writing failed.
                os.remove(tmp_rdf_file)
    print("Written..")
[docs]
def load_remote_datasets(self, dsnames, load_default_graph=True):
    '''Loads remote datasets into cache if a remote endpoint is set

    Parameters
    ----------

    dsnames : array
        array of dataset names (a single name is also accepted)

    load_default_graph : bool
        (Optional) when True, the default graph is fetched together with the
        dataset graphs

    Raises
    ------

    Exception
        If no remote endpoint is set, or if ``dsnames`` is None or empty

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Fetch LiPD data from remote RDF Graph
        lipd_remote = LiPD()
        lipd_remote.set_endpoint("https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse-dynamic")
        lipd_remote.load_remote_datasets(["Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001", "MD98_2181.Stott.2007", "Ant-WAIS-Divide.Severinghaus.2012"])
        print(lipd_remote.get_all_dataset_names())
    '''
    if not self.endpoint:
        raise Exception("No remote endpoint")

    # Reject None *before* wrapping a scalar into a list; otherwise None
    # would become [None] and only fail later while building the query.
    if dsnames is None:
        raise Exception("No dataset names to cache")
    if type(dsnames) is not list:
        dsnames = [dsnames]
    if len(dsnames) == 0:
        raise Exception("No dataset names to cache")

    # One named graph per dataset, optionally followed by the default graph.
    dsnamestr = ' '.join('<' + NSURL + "/" + dsname + '>' for dsname in dsnames)
    if load_default_graph:
        dsnamestr += f" <{DEFAULT_GRAPH_URI}>"

    print("Caching datasets from remote endpoint..")
    qres, _ = self.query(f"SELECT ?s ?p ?o ?g WHERE {{ GRAPH ?g {{ ?s ?p ?o }} VALUES ?g {{ {dsnamestr} }} }}", remote=True)

    # Fetched quads are added to the existing local graph (it is not reset).
    for row in qres:
        self.graph.add((row.s, row.p, row.o, row.g))
    print("Done..")
[docs]
def update_remote_datasets(self, dsnames):
    '''Updates local LiPD Graph for datasets to remote endpoint

    Not implemented yet: apart from checking that a remote endpoint is set,
    this method currently does nothing.

    Raises
    ------

    Exception
        If no remote endpoint is set
    '''
    if self.endpoint:
        # TODO: Implement this
        return None
    raise Exception("No remote endpoint")
[docs]
def get_bibtex(self, remote = True, save = True, path = 'mybiblio.bib', verbose = False):
    '''Get BibTeX for loaded datasets

    Parameters
    ----------

    remote : bool
        (Optional) If set to True, will return the bibliography by checking against the DOI.
        If the lookup fails, or if set to False, the entry is built from the
        information stored in the graph.

    save : bool
        (Optional) Whether to save the bibliography to a file

    path : str
        (Optional) Path where to save the file

    verbose : bool
        (Optional) Whether to print out on the console. Note that this option will turn on automatically if saving to a file fails.

    Returns
    -------

    bibs : list
        List of BiBTex entry

    df : pandas.DataFrame
        Bibliography information in a Pandas DataFrame

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Fetch LiPD data from remote RDF Graph
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            "../examples/data/MD98_2181.Stott.2007.lpd"
        ])
        print(lipd.get_bibtex(save=False))
    '''

    def establish_type(pub_type):
        # Map the free-form publication type onto one of
        # 'article', 'chapter', 'report' or 'misc'.
        if pub_type is None or pub_type == '' or pd.isna(pub_type):
            return 'misc'
        pub_type = str(pub_type).replace('-', '').lower()
        if 'article' in pub_type or 'shortcommunication' in pub_type:
            return 'article'
        if 'chapter' in pub_type or 'book' in pub_type:
            return 'chapter'
        if 'report' in pub_type:
            return 'report'
        return 'misc'

    def make_bib(row):
        # Build a pybtex bibliography holding a single entry from one row.
        pub_type = establish_type(row['type'])
        row = row.fillna("")

        # After fillna("") a missing key is an empty string, never None, so
        # test truthiness to decide whether a random key is needed.
        citation_key = row['citeKey']
        if not citation_key:
            characters = string.ascii_letters + string.digits
            citation_key = ''.join(random.choice(characters) for _ in range(8))
        citation_key = str(citation_key)

        # Field order matters: when the same field name is appended twice
        # (year/pubyear, url/url2, the various 'title' sources) both tuples
        # are handed to pybtex in this order.
        entries = []
        if row['authors']:
            entries.append(('author', str(row['authors'])))
        if row['doi']:
            entries.append(('doi', str(row['doi'])))
        if row['year']:
            entries.append(('year', str(row['year'])))
        if row['pubyear']:
            entries.append(('year', str(row['pubyear'])))
        if row['title']:
            if pub_type in ('article', 'misc'):
                entries.append(('title', str(row['title'])))
            else:  # 'chapter' or 'report'
                entries.append(('chapter', str(row['title'])))
        if row['journal']:
            if pub_type == 'article':
                entries.append(('journal', str(row['journal'])))
            elif pub_type == 'chapter':
                # Books and chapters are both classified as 'chapter' by
                # establish_type (it never returns 'book'); for those the
                # journal field supplies the entry title.
                entries.append(('title', str(row['journal'])))
        if row['volume']:
            entries.append(('volume', str(row['volume'])))
        if row['issue']:
            entries.append(('issue', str(row['issue'])))
        if row['pages']:
            entries.append(('pages', str(row['pages'])))
        if row['publisher']:
            entries.append(('publisher', str(row['publisher'])))
        if row['report']:
            entries.append(('title', str(row['report'])))
        if row['edition']:
            entries.append(('edition', str(row['edition'])))
        if row['institution']:
            entries.append(('institution', str(row['institution'])))
        if row['url']:
            entries.append(('url', str(row['url'])))
        if row['url2']:
            entries.append(('url', str(row['url2'])))

        entry_types = {'article': 'article', 'chapter': 'inbook', 'report': 'inbook', 'misc': 'misc'}
        return BibliographyData({citation_key: Entry(entry_types[pub_type], entries)})

    result, df = self.query(QUERY_BIBLIO)

    bibs = []
    for idx, row in df.iterrows():
        bib_text = None
        if remote:
            try:
                found = crossref.get_bib(row['doi'])
                if found[0]:
                    bib_text = found[1]
                else:
                    print(f"Cannot find a matching record for the provided DOI ({row['doi']}), creating the entry manually")
            except Exception:
                print("Cannot parse the provided DOI, creating the entry manually")
        # Manual fallback; also the only path when remote lookups are disabled.
        if bib_text is None:
            bib_text = make_bib(row).to_string('bibtex')
        bibs.append(bib_text)

    if save:
        try:
            with io.open(path, 'w', encoding="utf-8") as bibfile:
                for bib in bibs:
                    bibfile.write("{}\n".format(bib))
        except (OSError, TypeError):
            # Could not write the file: print the entries instead.
            print("Can't save in output file\n")
            verbose = True

    if verbose:
        print(bibs)

    return bibs, df
[docs]
def get_timeseries(self, dsnames, to_dataframe=False, mode="paleo", time="age"):
    '''Get Legacy LiPD like Time Series Object (tso)

    This function is meant to provide legacy support to the older version of the LiPD utilities, which returns a dictionary of timeseries objects. The function also supports returning to a pandas.DataFrame, essentially flattening all the information. This is useful to explore all possible properties but can be slow for large number of datasets or if you only require some standard information. In this case, use `get_timeseries_essentials`.

    Parameters
    ----------

    dsnames : list
        array of dataset id or name strings (a single string is also accepted)

    to_dataframe : bool {True; False}
        Whether to return a dataframe along the dictionary. Default is False

    mode : 'paleo' or 'chron'
        Whether to return information from the PaleoData or ChronData objects

    time : 'age' or 'year'
        Whether the time is expressed as year or age

    Returns
    -------

    ts : dict
        A dictionary containing Time Series Object

    df : Pandas.DataFrame
        If to_dataframe is set to True, returns a queryable Pandas DataFrame

    Examples
    --------

    To only return a list of timeseries objects

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Fetch LiPD data from remote RDF Graph
        lipd_remote = LiPD()
        lipd_remote.set_endpoint("https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse-dynamic")
        ts_list = lipd_remote.get_timeseries(["Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001", "MD98_2181.Stott.2007", "Ant-WAIS-Divide.Severinghaus.2012"])
        for dsname, tsos in ts_list.items():
            for tso in tsos:
                if 'paleoData_variableName' in tso:
                    print(dsname+': '+tso['paleoData_variableName']+': '+tso['archiveType'])

    To return a dataframe in addition to the list of timeseries objects

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        lipd_remote = LiPD()
        lipd_remote.set_endpoint("https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse-dynamic")
        ts_list, df = lipd_remote.get_timeseries(["Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001", "MD98_2181.Stott.2007", "Ant-WAIS-Divide.Severinghaus.2012"], to_dataframe = True)
        df.head()

    See also
    --------

    pylipd.lipd.LiPD.get_timeseries_essentials
    '''
    names = [dsnames] if type(dsnames) == str else dsnames
    ts = self._get_timeseries(names, mode=mode, time=time)

    if to_dataframe == False:
        return ts
    if to_dataframe == True:
        # Flatten {dataset: [tso, ...]} into one list with one row per tso.
        flattened = [tso for tsos in ts.values() for tso in tsos]
        df = pd.DataFrame.from_dict(flattened, orient='columns')
        return ts, df
def _get_timeseries(self, dsnames, mode="paleo", time="age"):
timeseries = {}
for dsname in dsnames:
converter = RDFToLiPD(self.graph)
d = converter.convert_to_json(dsname)
print("Extracting timeseries from dataset: " + dsname + " ...")
if len(d.items()):
tss = LiPD_Legacy().extract(d, mode=mode, time=time)
timeseries[dsname] = tss
return timeseries
[docs]
def get_timeseries_essentials(self, dsnames=None, mode='paleo'):
    ''' Returns specific properties for timeseries: 'dataSetName', 'archiveType', 'geo_meanLat', 'geo_meanLon',
    'geo_meanElev', 'paleoData_variableName', 'paleoData_values',
    'paleoData_units', 'paleoData_proxy' (paleo only), 'paleoData_proxyGeneral' (paleo only),
    'time_variableName', 'time_values', 'time_units', 'depth_variableName',
    'depth_values', 'depth_units'

    Parameters
    ----------
    dsnames : list
        array of dataset id or name strings. A single string is also
        accepted; None is treated as an empty name.

    mode : paleo, chron
        Whether to return the information stored in the PaleoMeasurementTable or the ChronMeasurementTable. The default is 'paleo'.

    Raises
    ------
    ValueError
        Need to select either 'chron' or 'paleo'

    Returns
    -------
    qres_df : pandas.DataFrame
        A pandas dataframe returning the properties in columns for each series stored in a row of the dataframe.
        None is returned when ``dsnames`` is an empty list.

    Examples
    --------

    .. jupyter-execute::

        from pylipd.utils.dataset import load_datasets
        lipd = load_datasets('ODP846.Lawrence.2006.lpd')
        df_paleo = lipd.get_timeseries_essentials(mode='paleo')
        print(df_paleo)

    To return the information stored in the ChronTable:

    .. jupyter-execute::

        from pylipd.utils.dataset import load_datasets
        lipd = load_datasets('ODP846.Lawrence.2006.lpd')
        df_chron = lipd.get_timeseries_essentials(mode='chron')
        print(df_chron)
    '''
    # Validate once, up front, instead of on every loop iteration.
    if mode not in ('paleo', 'chron'):
        raise ValueError("The mode should be either 'paleo' or 'chron'")

    if dsnames is None:
        dsnames = ''
    if isinstance(dsnames, str):
        dsnames = [dsnames]

    if mode == 'paleo':
        template = QUERY_TIMESERIES_ESSENTIALS_PALEO
    else:
        template = QUERY_TIMESERIES_ESSENTIALS_CHRON

    def parse_values(raw):
        # Values come back as JSON-encoded strings; anything else -> None.
        return np.array(json.loads(raw)) if isinstance(raw, str) else None

    # Decode whichever value columns the result actually contains, rather
    # than using exceptions to tell the paleo and chron results apart.
    value_columns = ('paleoData_values', 'chronData_values', 'depth_values', 'time_values')

    frames = []
    for dsname in dsnames:
        qres, qtmp_df = self.query(template.replace("[dsname]", dsname))
        for column in value_columns:
            if column in qtmp_df.columns:
                qtmp_df[column] = qtmp_df[column].apply(parse_values)
        frames.append(qtmp_df)

    if not frames:
        return None
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames, ignore_index=True)
[docs]
def get_lipd(self, dsname):
    '''Get LiPD json for a dataset

    Parameters
    ----------

    dsname : str
        dataset id

    Returns
    -------

    lipdjson : dict
        LiPD json

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Load a local LiPD file
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
        ])
        lipd_json = lipd.get_lipd(lipd.get_all_dataset_names()[0])
        print(lipd_json)
    '''
    return RDFToLiPD(self.graph).convert_to_json(dsname)
[docs]
def create_lipd(self, dsname, lipdfile):
    '''Create LiPD file for a dataset

    Parameters
    ----------

    dsname : str
        dataset id

    lipdfile : str
        path to LiPD file

    Returns
    -------

    lipdjson : dict
        LiPD json (the value returned by the RDF-to-LiPD converter)

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Load a local file
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
        ])
        dsname = lipd.get_all_dataset_names()[0]
        lipd.create_lipd(dsname, "test.lpd")
    '''
    return RDFToLiPD(self.graph).convert(dsname, lipdfile)
[docs]
def get(self, dsnames):
    '''Gets dataset(s) from the graph and returns them as a new LiPD object

    Parameters
    ----------

    dsnames : str or list of str
        dataset name(s) to get.

    Returns
    -------

    pylipd.lipd.LiPD
        LiPD object with the retrieved dataset(s)

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Load LiPD files from a local directory
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            "../examples/data/MD98_2181.Stott.2007.lpd"
        ])

        all_datasets = lipd.get_all_dataset_names()
        print("Loaded datasets: " + str(all_datasets))
        ds = lipd.get(all_datasets[0])
        print("Got dataset: " + str(ds.get_all_dataset_names()))
    '''
    names = dsnames if type(dsnames) is list else [dsnames]
    # Names not already starting with the namespace URL get it prepended.
    dsids = []
    for name in names:
        dsids.append(name if name.startswith(NSURL) else f"{NSURL}/{name}")
    subset = super().get(dsids)
    return LiPD(subset.graph)
[docs]
def pop(self, dsnames):
    '''Pops dataset(s) from the graph and returns the popped LiPD object

    Parameters
    ----------

    dsnames : str or list of str
        dataset name(s) to be popped.

    Returns
    -------

    pylipd.lipd.LiPD
        LiPD object with the popped dataset(s)

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Load local files
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            "../examples/data/MD98_2181.Stott.2007.lpd"
        ])
        all_datasets = lipd.get_all_dataset_names()
        print("Loaded datasets: " + str(all_datasets))
        popped = lipd.pop(all_datasets[0])
        print("Loaded datasets after pop: " + str(lipd.get_all_dataset_names()))
        print("Popped dataset: " + str(popped.get_all_dataset_names()))
    '''
    names = dsnames if type(dsnames) is list else [dsnames]
    # Names not already starting with the namespace URL get it prepended.
    dsids = []
    for name in names:
        dsids.append(name if name.startswith(NSURL) else f"{NSURL}/{name}")
    removed = super().pop(dsids)
    return LiPD(removed.graph)
[docs]
def remove(self, dsnames):
    '''Removes dataset(s) from the graph

    Parameters
    ----------

    dsnames : str or list of str
        dataset name(s) to be removed

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Load local files
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            "../examples/data/MD98_2181.Stott.2007.lpd"
        ])
        all_datasets = lipd.get_all_dataset_names()
        print("Loaded datasets: " + str(all_datasets))
        lipd.remove(all_datasets[0])
        print("Loaded datasets after remove: " + str(lipd.get_all_dataset_names()))
    '''
    names = dsnames if type(dsnames) is list else [dsnames]
    # Names not already starting with the namespace URL get it prepended.
    dsids = []
    for name in names:
        dsids.append(name if name.startswith(NSURL) else f"{NSURL}/{name}")
    super().remove(dsids)
[docs]
def get_all_dataset_names(self):
    '''Get all Dataset Names

    Returns
    -------

    dsnames : list
        A list of datasetnames

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Load local files
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            "../examples/data/MD98_2181.Stott.2007.lpd"
        ])
        print(lipd.get_all_dataset_names())
    '''
    rows, _ = self.query(QUERY_DSNAME)
    names = []
    for row in rows:
        names.append(sanitizeId(row.dsname))
    return names
[docs]
def get_all_dataset_ids(self):
    '''Get all Dataset ids

    Returns
    -------

    dsids : list
        A list of dataset ids

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Load local files
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            "../examples/data/MD98_2181.Stott.2007.lpd"
        ])
        print(lipd.get_all_dataset_ids())
    '''
    rows, _ = self.query(QUERY_DSID)
    ids = []
    for row in rows:
        ids.append(sanitizeId(row.dsid))
    return ids
[docs]
def get_all_archiveTypes(self):
    '''
    Returns a list of all the unique archiveTypes present in the LiPD object

    Returns
    -------

    list
        A list of archiveTypes

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        # Load Local files
        lipd = LiPD()
        lipd.load([
            "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            "../examples/data/MD98_2181.Stott.2007.lpd"
        ])
        print(lipd.get_all_archiveTypes())
    '''
    rows, _ = self.query(QUERY_UNIQUE_ARCHIVE_TYPE)
    archive_types = []
    for row in rows:
        archive_types.append(str(row.archiveType))
    return archive_types
[docs]
def get_all_locations(self, dsname = None):
    '''Return geographical coordinates for all the datasets.

    Parameters
    ----------

    dsname : str, optional
        The name of the dataset for which to return the location information. The default is None.

    Returns
    -------

    df : pandas.DataFrame
        A pandas dataframe returning the latitude, longitude and elevation for each dataset

    Examples
    --------

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        df = lipd.get_all_locations()
        print(df)
    '''
    # None is substituted into the query as an empty dataset name.
    name_filter = '' if dsname is None else dsname
    _, locations = self.query(QUERY_LOCATION.replace("[dsname]", name_filter))
    return locations
[docs]
def get_all_compilation_names(self):
    '''Return the names of the compilation present in the LiPD object

    Returns
    -------

    l : list
        A list returning the names of the available compilations.

    Examples
    --------

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Temp12k')
        df = lipd.get_all_compilation_names()
        print(df)
    '''
    rows, _ = self.query(QUERY_COMPILATION_NAME)
    compilations = []
    for row in rows:
        compilations.append(sanitizeId(row.compilationName))
    return compilations
[docs]
def get_ensemble_tables(self, dsname = None, ensembleVarName = None, ensembleDepthVarName = 'depth'):
    '''Gets ensemble tables from the LiPD graph

    Parameters
    ----------

    dsname : str
        The name of the dataset if you wish to analyse one at a time (Set to ".*" to match all datasets with a common root)

    ensembleVarName : None or str
        ensemble variable name. Default is None, which searches for names that contain "year" or "age" (Set to ".*" to match all ensemble variable names)

    ensembleDepthVarName : str
        ensemble depth variable name. Default is 'depth' (Set to ".*" to match all ensemble depth variable names)

    Returns
    -------

    ensemble_tables : dataframe
        A dataframe containing the ensemble tables

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        lipd = LiPD()
        lipd.load([
            "../examples/data/ODP846.Lawrence.2006.lpd"
        ])
        all_datasets = lipd.get_all_dataset_names()
        print("Loaded datasets: " + str(all_datasets))

        ens_df = lipd.get_ensemble_tables(
            ensembleVarName="age",
            ensembleDepthVarName="depth"
        )
        print(ens_df)
    '''
    name_filter = '' if dsname is None else dsname

    # The "short" query has no [ensembleVarName] placeholder to fill in.
    if ensembleVarName is None:
        query = QUERY_ENSEMBLE_TABLE_SHORT.replace("[dsname]", name_filter)
    else:
        query = QUERY_ENSEMBLE_TABLE.replace("[dsname]", name_filter)
        query = query.replace("[ensembleVarName]", ensembleVarName)
    query = query.replace("[ensembleDepthVarName]", ensembleDepthVarName)

    qres, qres_df = self.query(query)

    nan_pattern = re.compile(re.escape('NaN'), re.IGNORECASE)

    def parse_depth(raw):
        return np.array(json.loads(raw))

    def parse_ensemble(raw):
        # Any spelling of NaN becomes None so the text is a valid literal.
        return np.array(ast.literal_eval(nan_pattern.sub('None', raw)))

    qres_df['ensembleDepthValues'] = qres_df['ensembleDepthValues'].apply(parse_depth)
    qres_df['ensembleVariableValues'] = qres_df['ensembleVariableValues'].apply(parse_ensemble)
    return qres_df
[docs]
def get_all_variables(self):
    '''
    Returns a list of all variables in the graph

    Returns
    -------

    pandas.DataFrame
        A dataframe of all variables in the graph with columns uri, varid, varname

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        lipd = LiPD()
        lipd.load([
            "../examples/data/ODP846.Lawrence.2006.lpd"
        ])

        df = lipd.get_all_variables()
        print(df)
    '''
    _, variables = self.query(QUERY_VARIABLE)
    return variables
[docs]
def get_all_variable_names(self):
    """
    Get a list of all possible distinct variableNames. Useful for filtering and querying.

    Returns
    -------

    list
        A list of unique variableName

    Examples
    --------

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        varName = lipd.get_all_variable_names()
        print(varName)
    """
    _, names_df = self.query(QUERY_DISTINCT_VARIABLE)
    # The names are taken from the first column of the result.
    first_column = names_df.iloc[:, 0]
    return first_column.values.tolist()
[docs]
def get_dataset_properties(self):
    """Get a list of unique properties attached to a dataset.

    Note: Some properties will return another object (e.g., 'publishedIn' will give you a Publication object with its own properties)
    Note: Not all datasets will have the same available properties (i.e., not filled in by a user)

    Returns
    -------

    clean_list : list
        A list of available properties that can queried

    Examples
    --------

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir(name='Pages2k')
        dataset_properties = lipd.get_dataset_properties()
        print(dataset_properties)
    """
    _, props_df = self.query(QUERY_DATASET_PROPERTIES)
    clean_list = []
    for item in props_df.iloc[:, 0].values.tolist():
        # Keep only the text after the last '#'.
        clean_list.append(item.rpartition("#")[2])
    return clean_list
[docs]
def get_variable_properties(self):
    '''Get a list of variable properties that can be used for querying

    Returns
    -------

    list
        A list of unique variable properties

    Examples
    --------

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir(name='Pages2k')
        variable_properties = lipd.get_variable_properties()
        print(variable_properties)
    '''
    _, props_df = self.query(QUERY_VARIABLE_PROPERTIES)
    clean_list = []
    for item in props_df.iloc[:, 0].values.tolist():
        # Keep only the text after the last '#'.
        clean_list.append(item.rpartition("#")[2])
    return clean_list
[docs]
def get_model_properties(self):
    '''Get all the properties associated with a model

    Returns
    -------

    List
        A list of unique properties attached to models

    Examples
    --------

    .. jupyter-execute::

        from pylipd.utils.dataset import load_datasets

        lipd = load_datasets(names='ODP846')
        model_properties = lipd.get_model_properties()
        print(model_properties)
    '''
    _, props_df = self.query(QUERY_MODEL_PROPERTIES)
    clean_list = []
    for item in props_df.iloc[:, 0].values.tolist():
        # Keep only the text after the last '#'.
        clean_list.append(item.rpartition("#")[2])
    return clean_list
[docs]
def to_lipd_series(self, parallel=False):
    '''
    Converts the LiPD object to a LiPDSeries object

    Parameters
    ----------

    parallel : bool
        Whether to use parallel processing to load the data. Default is False

    Returns
    -------

    pylipd.lipd.LiPDSeries
        A LiPDSeries object

    Examples
    --------

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        lipd = LiPD()
        lipd.load([
            "../examples/data/ODP846.Lawrence.2006.lpd"
        ])

        S = lipd.to_lipd_series()
    '''
    # A fresh LiPDSeries is populated from this object and handed back.
    series = LiPDSeries()
    series.load(self, parallel)
    return series
# bbox = left,bottom,right,top
# bbox = min Longitude , min Latitude , max Longitude , max Latitude
[docs]
def filter_by_geo_bbox(self, lonMin, latMin, lonMax, latMax):
    '''
    Filters datasets to return a new LiPD object that only keeps datasets that fall within the bounding box

    Parameters
    ----------

    lonMin : float
        Minimum longitude

    latMin : float
        Minimum latitude

    lonMax : float
        Maximum longitude

    latMax : float
        Maximum latitude

    Returns
    -------

    pylipd.lipd.LiPD
        A new LiPD object that only contains datasets that fall within the bounding box

    Examples
    --------
    pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir()
        Lfiltered = lipd.filter_by_geo_bbox(0,25,50,50)
        Lfiltered.get_all_dataset_names()
    '''
    # Placeholders are filled in this fixed order.
    substitutions = (
        ("[lonMin]", lonMin),
        ("[latMin]", latMin),
        ("[lonMax]", lonMax),
        ("[latMax]", latMax),
    )
    query = QUERY_FILTER_GEO
    for placeholder, value in substitutions:
        query = query.replace(placeholder, str(value))

    rows, _ = self.query(query)
    matching = []
    for row in rows:
        matching.append(sanitizeId(row.dsname))
    return self.get(matching)
[docs]
def filter_by_archive_type(self, archiveType):
    '''
    Filters datasets to return a new LiPD object that only keeps datasets that have the specified archive type

    Parameters
    ----------

    archiveType : str
        The archive type to filter by

    Returns
    -------

    pylipd.lipd.LiPD
        A new LiPD object that only contains datasets that have the specified archive type (regex)

    Examples
    --------
    pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        Lfiltered = lipd.filter_by_archive_type('marine')
        Lfiltered.get_all_archiveTypes()

    If searching for multiple archiveTypes, you can construct the name as follows:

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        Lfiltered = lipd.filter_by_archive_type('marine|coral')
        Lfiltered.get_all_archiveTypes()
    '''
    rows, _ = self.query(QUERY_FILTER_ARCHIVE_TYPE.replace("[archiveType]", archiveType))
    matching = []
    for row in rows:
        matching.append(sanitizeId(row.dsname))
    return self.get(matching)
[docs]
def filter_by_datasetName(self, datasetName):
    '''
    Filters datasets to return a new LiPD object that only keeps datasets that have the specified names

    Parameters
    ----------

    datasetName : str
        The datasetNames to filter by

    Returns
    -------

    pylipd.lipd.LiPD
        A new LiPD object that only contains datasets that have the specified names (regex)

    Examples
    --------
    pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        Lfiltered = lipd.filter_by_datasetName('Ocn-RedSea.Felis.2000')
        Lfiltered.get_all_dataset_names()

    If searching for multiple dataset names, you can construct the name as follows:

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        dsnames = ['Ocn-RedSea.Felis.2000','Ant-WAIS-Divide.Severinghaus.2012']
        dsquery = '|'.join(dsnames)
        Lfiltered = lipd.filter_by_datasetName(dsquery)
        Lfiltered.get_all_dataset_names()
    '''
    rows, _ = self.query(QUERY_FILTER_DATASET_NAME.replace("[datasetName]", datasetName))
    matching = []
    for row in rows:
        matching.append(sanitizeId(row.dsname))
    return self.get(matching)
[docs]
def filter_by_compilationName(self, compilationName):
    '''
    Build a new LiPD object restricted to datasets that belong to a given compilation

    Parameters
    ----------

    compilationName : str
        The name of the compilation to filter by. The value is substituted
        for the ``[compilationName]`` placeholder of the compilation filter query.

    Returns
    -------

    pylipd.lipd.LiPD
        A new LiPD object that only contains the datasets returned by the
        compilation query

    Examples
    --------

    pyLipd ships with existing datasets that can be loaded directly through the package. Let's load every available sample dataset and keep the ones that are part of a compilation.

    .. jupyter-execute::

        from pylipd.utils.dataset import available_dataset_names, load_datasets

        dsList = available_dataset_names()
        D = load_datasets(dsList)
        Dfiltered = D.filter_by_compilationName('Temp12k')
        Dfiltered.get_all_dataset_names()
    '''
    # NOTE(review): compilationName is pasted into the query text verbatim
    # (no escaping), so quotes inside the name will alter the query itself.
    sparql = QUERY_FILTER_COMPILATION.replace("[compilationName]", compilationName)
    rows, _ = self.query(sparql)

    # This query exposes the dataset name as `dataSetName` (not `dsname`).
    matching = []
    for row in rows:
        matching.append(sanitizeId(row.dataSetName))
    return self.get(matching)
[docs]
def filter_by_time(self, timeBound, timeBoundType='any', recordLength=None):
    """
    Filter the records according to a specified time interval and the length of the record within that interval. Note that this function assumes that all records use the same time representation.

    If you are unsure about the time representation, you may need to use `.get_timeseries_essentials`.

    Parameters
    ----------

    timeBound : list
        Minimum and Maximum age value to search for. If the two values are
        given in decreasing order they are swapped.

    timeBoundType : str, optional
        The type of querying to perform (case-insensitive). Possible values include: "any", "entire", and "entirely".

        - any: the record overlaps any portion of the interval (default)
        - entirely: the record covers the interval entirely (it starts at or before the lower bound and ends at or after the upper bound)
        - entire: the entire record lies inside the interval (it can be shorter than the bounds)

        The default is 'any'.

    recordLength : float, optional
        The minimum length the record needs to have while matching the ageBound criteria.
        For "any" this is the length of the overlap between the record and the
        interval; for "entire" and "entirely" it is the full length of the record.
        The default is None (no length requirement).

    Raises
    ------

    ValueError
        timeBoundType must take the values in ["any", "entire", and "entirely"],
        and timeBound must provide two values.

    Returns
    -------

    pylipd.lipd.LiPD
        A new LiPD object that only contains datasets that have the specified time interval

    Examples
    --------

    pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        Lfiltered = lipd.filter_by_time(timeBound=[0,1800])
        Lfiltered.get_all_dataset_names()
    """
    # Validate the arguments up front so a bad call fails before the query runs.
    if timeBound is None or len(timeBound) < 2:
        raise ValueError("timeBound must provide two values: [minimum age, maximum age]")
    bound_type = timeBoundType.lower()
    if bound_type not in ('any', 'entire', 'entirely'):
        raise ValueError("timeBoundType must be in ['any', 'entirely','entire']")

    # Accept the bounds in either order.
    lower, upper = timeBound[0], timeBound[1]
    if lower > upper:
        lower, upper = upper, lower

    __, df = self.query(QUERY_FILTER_TIME)
    minage, maxage = df['minage'], df['maxage']

    if bound_type == 'entirely':
        # The record spans the whole interval.
        mask = (minage <= lower) & (maxage >= upper)
        span = np.abs(maxage - minage)
    elif bound_type == 'entire':
        # The record is contained in the interval.
        mask = (minage >= lower) & (maxage <= upper)
        span = np.abs(maxage - minage)
    else:
        # Two ranges intersect only if each one starts before the other ends;
        # testing the upper bound alone would also keep records that end
        # before the interval begins.
        mask = (minage <= upper) & (maxage >= lower)
        # Length of the part of the record that falls inside the interval.
        span = np.minimum(maxage, upper) - np.maximum(minage, lower)

    if recordLength is not None:
        mask = mask & (span >= recordLength)

    dsnames = list(df.loc[mask, 'dsname'])
    return self.get(dsnames)
[docs]
def get_datasets(self) -> 'list[Dataset]':
    '''
    Convert every dataset held in the graph into a Dataset instance

    Returns
    -------

    list of pylipd.classes.Dataset
        One Dataset object per name returned by ``get_all_dataset_names``

    Examples
    --------

    pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        lipd.get_datasets()
    '''
    def _as_dataset(name):
        # The dataset URI is the namespace URL followed by the dataset name.
        uri = NSURL + "/" + name
        # Serialize the dataset's part of the graph to JSON, then parse it
        # back into plain Python data for the Dataset constructor.
        payload = json.loads(RDFToJSON(uri, self.graph).to_json())
        return Dataset.from_data(uri, payload)

    return [_as_dataset(name) for name in self.get_all_dataset_names()]
def _generate_unique_id(self, prefix='PYD'):
# Generate a random UUID
random_uuid = uuid.uuid4() # Generates a random UUID.
# Convert UUID format to the specific format we need
# UUID is usually in the form '1e2a2846-2048-480b-9ec6-674daef472bd' so we slice and insert accordingly
id_str = str(random_uuid)
formatted_id = f"{prefix}-{id_str[:5]}-{id_str[9:13]}-{id_str[14:18]}-{id_str[19:23]}-{id_str[24:28]}"
return formatted_id
def _fix_missing_ids(self, ds: Dataset):
    """Fill in missing table file names and variable ids of a dataset, in place.

    Every paleo measurement table, chron measurement table and chron model
    ensemble table that has no file name gets one derived from its position
    in the dataset; every variable of those tables that has no variable id
    gets a freshly generated 'TS'-prefixed id. Values that are already set
    are left untouched.
    """
    def _fill_defaults(table, default_filename):
        # Assign datatable csv file name if not present
        if not table.getFileName():
            table.setFileName(default_filename)
        # Assign variable ids if not present
        for variable in table.getVariables():
            if not variable.getVariableId():
                variable.setVariableId(self._generate_unique_id(prefix='TS'))

    for paleo_idx, paleo in enumerate(ds.getPaleoData()):
        for table_idx, table in enumerate(paleo.getMeasurementTables()):
            _fill_defaults(table, f"paleo{paleo_idx}measurement{table_idx}.csv")

    for chron_idx, chron in enumerate(ds.getChronData()):
        for table_idx, table in enumerate(chron.getMeasurementTables()):
            _fill_defaults(table, f"chron{chron_idx}measurement{table_idx}.csv")
        # Ensemble tables are numbered per model, models per chron entry.
        for model_idx, model in enumerate(chron.getModeledBy()):
            for table_idx, table in enumerate(model.getEnsembleTables()):
                _fill_defaults(
                    table,
                    f"chron{chron_idx}model{model_idx}ensemble{table_idx}.csv",
                )
[docs]
def load_datasets(self, datasets: 'list[Dataset]'):
    '''
    Add instances of the Dataset class to the LiPD graph

    Before a dataset is written to the graph, missing table file names and
    variable ids are filled in on the Dataset object itself.

    Parameters
    ----------

    datasets : list of pylipd.classes.Dataset
        A list of Dataset objects

    Examples
    --------

    pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

    .. jupyter-execute::

        from pylipd.utils.dataset import load_dir

        lipd = load_dir('Pages2k')
        dses = lipd.get_datasets()

        # Modify the datasets if needed, then write them to the same, or another LiPD object
        lipd2 = LiPD()
        lipd2.load_datasets(dses)
    '''
    for dataset in datasets:
        # Complete the dataset first so the serialized data carries the ids.
        self._fix_missing_ids(dataset)
        # The dataset's own id is used as its URI in the graph.
        JSONToRDF(self.graph, dataset.id).load_data_in_graph(dataset.to_data())