Source code for pylipd.lipd_series

from tqdm import tqdm
from .globals.queries import QUERY_FILTER_VARIABLE_NAME, QUERY_VARIABLE, QUERY_DISTINCT_VARIABLE, QUERY_VARIABLE_ESSENTIALS, QUERY_DISTINCT_PROXY, QUERY_FILTER_VARIABLE_PROXY, QUERY_FILTER_VARIABLE_RESOLUTION, QUERY_LiPDSERIES_PROPERTIES
from .utils.multi_processing import multi_load_lipd_series
from .utils.rdf_graph import RDFGraph

import numpy as np
import json

class LiPDSeries(RDFGraph):
    """A collection of LiPD variables stored as an RDF graph.

    The LiPD Series class describes a collection of `LiPD (Linked Paleo Data)
    <https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016.html>`_
    variables. It contains an `RDF <https://www.w3.org/RDF/>`_ Graph which is
    a serialization of LiPD variables into an RDF graph containing terms from
    the `LiPD Ontology
    <http://linked.earth/Ontology/release/core/1.2.0/index-en.html>`_.
    Each LiPD variable is also associated with the LiPD itself so it can be
    deserialized into the original LiPD format.

    How to browse and query the LiPD variables is described in a short
    example below.

    Examples
    --------
    In this example, we read an online LiPD file and convert it into a
    LiPDSeries object.

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        lipd = LiPD()
        lipd.load(["https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1//Nunalleq.Ledger.2018.lpd"])
        lipd_series = lipd.to_lipd_series()
    """

    def __init__(self, graph=None):
        """Create a series, optionally wrapping an existing graph.

        Parameters
        ----------
        graph : optional
            Forwarded unchanged to the ``RDFGraph`` constructor.
        """
        super().__init__(graph)
        # Maps a dataset context identifier (str) to the object returned by
        # ``lipd.get(ctxid)`` in ``load`` (presumably a per-dataset LiPD).
        self.lipds = {}

    def load(self, lipd, parallel=False):
        """Extract Variables from the LiPD object.

        Parameters
        ----------
        lipd : LiPD
            A LiPD object
        parallel : bool, optional
            Passed through to ``multi_load_lipd_series``; presumably enables
            parallel processing of the datasets. The default is False.

        Examples
        --------
        .. jupyter-execute::

            from pylipd.lipd import LiPD

            lipd = LiPD()
            lipd.load(["https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1//Nunalleq.Ledger.2018.lpd"])
            lipd_series = lipd.to_lipd_series()
        """
        print("Creating LiPD Series...")

        # Update graph (Create contexts for each variable)
        print("- Extracting dataset subgraphs")
        total = len(lipd.get_all_dataset_names())
        for ctx in tqdm(lipd.graph.contexts(), total=total):
            ctxid = str(ctx.identifier)
            self.lipds[ctxid] = lipd.get(ctxid)

        multi_load_lipd_series(self.graph, self.lipds, parallel)
        print("Done..")

    def get_all_variables(self):
        """Returns a list of all variables in the graph

        Returns
        -------
        pandas.DataFrame
            A dataframe of all variables in the graph with columns uri,
            varid, varname

        Examples
        --------
        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir()
            S = lipd.to_lipd_series()
            df = S.get_all_variables()
            print(df)
        """
        # ``query`` returns a (raw result, DataFrame) pair; keep the frame.
        return self.query(QUERY_VARIABLE)[1]

    def get_all_variable_names(self):
        """Get a list of all possible distinct variableNames.

        Useful for filtering and querying.

        Returns
        -------
        list
            A list of unique variableName

        Examples
        --------
        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir('Pages2k')
            S = lipd.to_lipd_series()
            varName = S.get_all_variable_names()
            print(varName)
        """
        return self.query(QUERY_DISTINCT_VARIABLE)[1].iloc[:, 0].values.tolist()

    def get_all_proxy(self):
        """Get a list of all possible proxy.

        Useful for filtering and querying.

        Returns
        -------
        list
            A list of unique proxies

        Examples
        --------
        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir('Pages2k')
            S = lipd.to_lipd_series()
            proxyName = S.get_all_proxy()
            print(proxyName)
        """
        return self.query(QUERY_DISTINCT_PROXY)[1].iloc[:, 0].values.tolist()

    def get_timeseries_essentials(self):
        """Return essential information about each variable.

        The information returned is: `dataSetName`, `archiveType`, `name`,
        `values`, `units`, `TSID`, `proxy`.

        Returns
        -------
        qres_df : pandas.DataFrame
            A dataframe containing the information in each column. The
            `dataSetName` column holds only the text after the last ``/``
            and the `values` column holds numpy arrays decoded from JSON.

        Examples
        --------
        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir()
            S = lipd.to_lipd_series()
            df = S.get_timeseries_essentials()
            print(df)
        """
        _, qres_df = self.query(QUERY_VARIABLE_ESSENTIALS)

        # Keep only the last path segment of the dataset name. This is done
        # column-wise because rows yielded by ``DataFrame.iterrows`` may be
        # copies, so assigning to them is not guaranteed to update the frame.
        qres_df['dataSetName'] = qres_df['dataSetName'].apply(
            lambda name: name.split('/')[-1]
        )
        # Values come back as JSON-encoded strings; decode them to arrays.
        qres_df['values'] = qres_df['values'].apply(
            lambda raw: np.array(json.loads(raw))
        )
        return qres_df

    def get_variable_properties(self):
        """Get a list of all the properties name associated with the dataset.

        Useful to write custom queries

        Returns
        -------
        clean_list : list
            A list of unique variable properties

        Examples
        --------
        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir()
            S = lipd.to_lipd_series()
            l = S.get_variable_properties()
            print(l)
        """
        query_list = self.query(QUERY_LiPDSERIES_PROPERTIES)[1].iloc[:, 0].values.tolist()
        # Keep only the fragment after the last '#' of each property URI.
        clean_list = [item.split("#")[-1] for item in query_list]
        return clean_list

    def _filter_by_query(self, query):
        """Build a new LiPDSeries from the variables matched by ``query``.

        Parameters
        ----------
        query : str
            A query whose result rows expose ``uri`` (the variable) and
            ``dsuri`` (the dataset the variable belongs to).

        Returns
        -------
        pylipd.lipd_series.LiPDSeries
            A new LiPDSeries holding the matched variables and copies of
            the corresponding entries of ``self.lipds``.
        """
        qres, _ = self.query(query)
        # Materialise once so the result is only traversed a single time.
        rows = list(qres)
        varuris = [str(row.uri) for row in rows]
        dsuris = {str(row.dsuri) for row in rows}

        rdfgraph = self.get(varuris)
        S = LiPDSeries(rdfgraph.graph)
        S.lipds = {k: self.lipds[k].copy() for k in dsuris}
        return S

    def filter_by_name(self, name):
        """Filter the series by variable name.

        Returns a new LiPDSeries that only keeps variables that have the
        specified name (regex).

        Parameters
        ----------
        name : str
            The variable name to filter by

        Returns
        -------
        pylipd.lipd_series.LiPDSeries
            A new LiPDSeries object that only contains variables that have
            the specified name (regex)

        Examples
        --------
        .. jupyter-execute::

            from pylipd.utils.dataset import load_datasets

            lipd = load_datasets('ODP846.Lawrence.2006.lpd')
            S = lipd.to_lipd_series()
            sst = S.filter_by_name('sst')
            print(sst.get_all_variable_names())
        """
        query = QUERY_FILTER_VARIABLE_NAME.replace("[name]", name)
        return self._filter_by_query(query)

    def filter_by_proxy(self, proxy):
        """Filter the series by proxy.

        Returns a new LiPDSeries that only keeps variables that have the
        specified proxy (regex).

        Parameters
        ----------
        proxy : str
            The name of the proxy to filter by

        Returns
        -------
        pylipd.lipd_series.LiPDSeries
            A new LiPDSeries object that only contains variables that have
            the specified proxy (regex)

        Examples
        --------
        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir('Pages2k')
            S = lipd.to_lipd_series()
            S_filtered = S.filter_by_proxy('ring width')
            print(S_filtered.get_all_proxy())
        """
        query = QUERY_FILTER_VARIABLE_PROXY.replace("[proxy]", proxy)
        return self._filter_by_query(query)

    def filter_by_resolution(self, threshold, stats='Mean'):
        """Filter the series by resolution.

        Returns a new LiPDSeries that only keeps variables that have a
        resolution less than the specified threshold.

        Parameters
        ----------
        threshold : float
            The maximum resolution to keep
        stats : str, optional
            Whether to use 'Mean', 'Median', 'Min' or 'Max' resolution.
            Matching is case-insensitive. The default is 'Mean'.

        Raises
        ------
        ValueError
            Make sure that the stats is of ['Mean','Median', 'Min', 'Max'].

        Returns
        -------
        S : pylipd.lipd_series.LiPDSeries
            A new LiPDSeries object that only contains the filtered variables

        Examples
        --------
        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir('Pages2k')
            S = lipd.to_lipd_series()
            S_filtered = S.filter_by_resolution(10)
        """
        # Normalise the case so that e.g. 'mean' and 'MEAN' are accepted.
        stats = stats.capitalize()
        stats_allowed = ['Mean', 'Median', 'Min', 'Max']
        if stats not in stats_allowed:
            raise ValueError("Stats must be ['Mean','Median', 'Min', 'Max']")

        # Make sure this is a float or can be coerced into one.
        threshold = float(threshold)

        query = QUERY_FILTER_VARIABLE_RESOLUTION
        query = query.replace("[value]", str(threshold))
        query = query.replace("[stat]", stats)
        return self._filter_by_query(query)