from rdflib import ConjunctiveGraph, Namespace, URIRef
from tqdm import tqdm
from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS
from pylipd.globals.urls import ONTONS
from .utils.multi_processing import multi_load_lipd_series
from .utils.rdf_graph import RDFGraph
from .utils.utils import sanitizeId
import numpy as np
[docs]class LiPDSeries(RDFGraph):
'''The LiPD Series class describes a collection of `LiPD (Linked Paleo Data) <https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016.html>`_
variables. It contains an `RDF <https://www.w3.org/RDF/>`_ Graph which is serialization of LiPD variables into an RDF graph containing terms from
the `LiPD Ontology <http://linked.earth/Ontology/release/core/1.2.0/index-en.html>`. Each LiPD Variable is also associated with the LiPD itself
so it can be deserialized into the original LiPD format.
How to browse and query the LiPD variables is described in a short example below.
Examples
--------
In this example, we read an online LiPD file and convert it into a time series object dictionary.
.. jupyter-execute::
from pylipd.lipd_series import LiPDSeries
lipd = LiPD()
lipd.load(["https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1//Nunalleq.Ledger.2018.lpd"])
lipd_series = lipd.to_lipd_series()
'''
def __init__(self, graph=None):
super().__init__(graph)
self.lipds = {}
[docs] def load(self, lipd, parallel=False):
'''Extract Variables from the LiPD object.
Parameters
----------
lipd : LiPD
A LiPD object
Examples
--------
.. jupyter-execute::
from pylipd.lipd_series import LiPDSeries
lipd = LiPD()
lipd.load(["https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1//Nunalleq.Ledger.2018.lpd"])
lipd_series = lipd.to_lipd_series()
'''
print(f"Creating LiPD Series...")
# Update graph (Create contexts for each variable)
print("- Extracting dataset subgraphs")
total = len(lipd.get_all_dataset_names())
for ctx in tqdm(lipd.graph.contexts(), total=total):
ctxid = str(ctx.identifier)
self.lipds[ctxid] = lipd.get(ctxid)
multi_load_lipd_series(self.graph, self.lipds, parallel)
print("Done..")
[docs] def get_all_variables(self):
'''
Returns a list of all variables in the graph
Returns
-------
pandas.DataFrame
A dataframe of all variables in the graph with columns uri, varid, varname
Examples
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir()
S = lipd.to_lipd_series()
df = S.get_all_variables()
print(df)
'''
return self.query(QUERY_VARIABLE)[1]
[docs] def get_all_variable_names(self):
"""
Get a list of all possible distinct variableNames. Useful for filtering and qeurying.
Returns
-------
list
A list of unique variableName
Examples
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir('Pages2k')
S = lipd.to_lipd_series()
varName = S.get_all_variable_names()
print(varName)
"""
return self.query(QUERY_DISTINCT_VARIABLE)[1].iloc[:,0].values.tolist()
[docs] def get_timeseries_essentials(self):
'''This function returns information about each variable: `dataSetName`, `archiveType`, `name`, `values`, `units`, `TSID`, `proxy`.
Returns
-------
qres_df : pandas.DataFrame
A dataframe containing the information in each column
Examples
--------
.. jupyter-execute::
from pylipd.utils.dataset import load_dir
lipd = load_dir()
S = lipd.to_lipd_series()
df = S.get_timeseries_essentials()
print(df)
'''
query = QUERY_VARIABLE_ESSENTIALS
qres, qres_df = self.query(query)
#fix the dataframe
for _,row in qres_df.iterrows():
string = row['dataSetName'].split('/')[-1]
row['dataSetName'] = string
qres_df['values']=qres_df['values'].apply(lambda row : np.fromstring(row.strip("[]"), sep=','))
return qres_df
[docs] def filter_by_name(self, name):
'''
Filters series to return a new LiPDSeries that only keeps variables that have the specified name (regex)
Parameters
----------
name : str
The variable name to filter by
Returns
-------
pylipd.lipd_series.LiPDSeries
A new LiPDSeries object that only contains variables that have the specified name (regex)
'''
query = QUERY_FILTER_VARIABLE_NAME
query = query.replace("[name]", name)
qres, qres_df = self.query(query)
varuris = [str(row.uri) for row in qres]
dsuris = [*set([str(row.dsuri) for row in qres])]
print(len(dsuris))
rdfgraph = self.get(varuris)
S = LiPDSeries(rdfgraph.graph)
S.lipds = {k: self.lipds[k].copy() for k in dsuris}
return S