# Source code for pylipd.lipd

"""
The LiPD class describes a `LiPD (Linked Paleo Data) <https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016.html>`_ object. It contains an `RDF <https://www.w3.org/RDF/>`_ Graph which is a serialization of the LiPD data into an RDF graph containing terms from the `LiPD Ontology <http://linked.earth/Ontology/release/core/1.2.0/index-en.html>`_.
How to browse and query LiPD objects is described in a short example below, while `this notebook <https://nbviewer.jupyter.org/github/LinkedEarth/pylipd/blob/master/example_notebooks/pylipd_tutorial.ipynb>`_ demonstrates how to use PyLiPD to view and query LiPD datasets.
"""

import os
import re
import os.path
import tempfile
import pandas as pd
import random
import string
import io
import numpy as np
import ast

from rdflib import ConjunctiveGraph, URIRef
from tqdm import tqdm
from .globals.queries import QUERY_ALL_VARIABLES_GRAPH, QUERY_BIBLIO, QUERY_DSID, QUERY_DSNAME, QUERY_ENSEMBLE_TABLE, QUERY_ENSEMBLE_TABLE_SHORT, QUERY_FILTER_ARCHIVE_TYPE, QUERY_FILTER_GEO, QUERY_VARIABLE, QUERY_VARIABLE_GRAPH, QUERY_UNIQUE_ARCHIVE_TYPE, QUERY_TIMESERIES_ESSENTIALS_CHRON, QUERY_TIMESERIES_ESSENTIALS_PALEO, QUERY_DISTINCT_VARIABLE, QUERY_DATASET_PROPERTIES, QUERY_VARIABLE_PROPERTIES, QUERY_MODEL_PROPERTIES, QUERY_LOCATION
from .lipd_series import LiPDSeries
from .utils.multi_processing import multi_convert_to_rdf, multi_load_lipd
from .utils.rdf_graph import RDFGraph

from .utils.rdf_to_lipd import RDFToLiPD
from .utils.legacy_utils import LiPD_Legacy
from .utils.utils import sanitizeId

#import bibtexparser
#from bibtexparser.bibdatabase import BibDatabase
from doi2bib import crossref
from pybtex.database import BibliographyData, Entry

from .globals.urls import NSURL

class LiPD(RDFGraph):
    '''The LiPD class describes a `LiPD (Linked Paleo Data) <https://cp.copernicus.org/articles/12/1093/2016/cp-12-1093-2016.html>`_ object. It contains an `RDF <https://www.w3.org/RDF/>`_ Graph which is a serialization of the LiPD data into an RDF graph containing terms from the `LiPD Ontology <http://linked.earth/Ontology/release/core/1.2.0/index-en.html>`_.

    How to browse and query LiPD objects is described in a short example below.

    Examples
    --------

    In this example, we read an online LiPD file and convert it into a time series object dictionary.

    .. jupyter-execute::

        from pylipd.lipd import LiPD

        lipd = LiPD()
        lipd.load(["https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1/Nunalleq.Ledger.2018.lpd"])

        ts_list = lipd.get_timeseries(lipd.get_all_dataset_names())

        for dsname, tsos in ts_list.items():
            for tso in tsos:
                if 'paleoData_variableName' in tso:
                    print(dsname+': '+tso['paleoData_variableName']+': '+tso['archiveType'])
    '''

    def __init__(self, graph=None):
        super().__init__(graph)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _as_list(value):
        '''Return `value` as a list: lists/tuples are copied, anything else is wrapped.'''
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    @staticmethod
    def _to_dataset_ids(dsnames):
        '''Prefix each dataset name with NSURL unless it already starts with it.'''
        return [
            dsname if dsname.startswith(NSURL) else f"{NSURL}/{dsname}"
            for dsname in LiPD._as_list(dsnames)
        ]

    @staticmethod
    def _parse_numeric_list(text):
        '''Parse a string such as "[1,2,3]" into a numpy array; None is passed through.'''
        if text is None:
            return None
        return np.fromstring(text.strip("[]"), sep=',')

    def _query_property_names(self, query):
        '''Run `query` and return, for each value of the first result column, the text after the last "#".'''
        query_list = self.query(query)[1].iloc[:, 0].values.tolist()
        return [item.split("#")[-1] for item in query_list]

    # ------------------------------------------------------------------
    # Loading / converting
    # ------------------------------------------------------------------

    def load_from_dir(self, dir_path, parallel=False, cutoff=None):
        '''Load LiPD files from a directory

        Parameters
        ----------

        dir_path : str
            path to the directory containing lipd files

        parallel: bool
            (Optional) set to True to process lipd files in parallel. You *must* run this function under the "__main__" process for this to work

        cutoff : int
            (Optional) the maximum number of files to load at once.

        Examples
        --------

        In this example, we load LiPD files from a directory.

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            lipd = LiPD()
            lipd.load_from_dir("../examples/data")

            print(lipd.get_all_dataset_names())
        '''
        if not os.path.isdir(dir_path):
            print(f"Directory {dir_path} does not exist")
            return

        # Sorted so that `cutoff` always selects the same files
        # (os.listdir returns entries in arbitrary order).
        lipdfiles = []
        for name in sorted(os.listdir(dir_path)):
            file_path = os.path.join(dir_path, name)
            if os.path.isfile(file_path) and name.endswith(".lpd"):
                lipdfiles.append(file_path)

        if cutoff:
            lipdfiles = lipdfiles[0:cutoff]

        self.load(lipdfiles, parallel)

    # Allows loading http locations
    def load(self, lipdfiles, parallel=False):
        '''Load LiPD files.

        Parameters
        ----------

        lipdfiles : list of str
            array of paths to lipd files (the paths could also be urls).
            A single path is also accepted.

        parallel: bool
            (Optional) set to True to process lipd files in parallel. You *must* run this function under the "__main__" process for this to work

        Examples
        --------

        In this example, we load LiPD files for an array of paths.

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
                "../examples/data/MD98_2181.Stott.2007.lpd",
                "../examples/data/Ant-WAIS-Divide.Severinghaus.2012.lpd",
                "https://lipdverse.org/data/LCf20b99dfe8d78840ca60dfb1f832b9ec/1_0_1/Nunalleq.Ledger.2018.lpd"
            ])

            print(lipd.get_all_dataset_names())
        '''
        lipdfiles = self._as_list(lipdfiles)

        numfiles = len(lipdfiles)
        print(f"Loading {numfiles} LiPD files")
        self.graph = multi_load_lipd(self.graph, lipdfiles, parallel)
        print("Loaded..")

    #def load_from_lipdverse(self, datasetID, version=None):

    def convert_lipd_dir_to_rdf(self, lipd_dir, rdf_file, parallel=False):
        '''Convert a directory containing LiPD files into a single RDF file (to be used for uploading to Knowledge Bases like GraphDB)

        Parameters
        ----------

        lipd_dir : str
            Path to the directory containing lipd files

        rdf_file : str
            Path to the output rdf file

        parallel : bool
            (Optional) passed through to the converter to process files in parallel.
        '''
        # Intermediate per-dataset files live in a private temporary directory
        # that is removed on exit, even if the conversion raises.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filemap = {}
            for index, name in enumerate(sorted(os.listdir(lipd_dir))):
                fullpath = os.path.join(lipd_dir, name)
                # Same selection rule as load_from_dir: regular ".lpd" files only
                if os.path.isfile(fullpath) and name.endswith(".lpd"):
                    filemap[fullpath] = os.path.join(tmp_dir, f"graph_{index}")

            print(f"Converting {len(filemap.keys())} LiPD files to RDF..")
            multi_convert_to_rdf(filemap, parallel)
            print("Conversion to RDF done..")

            print("Writing to main RDF file..")
            with open(rdf_file, "w", encoding="utf-8") as fout:
                for tmp_rdf_file in filemap.values():
                    # A missing file means that dataset produced no output; skip it
                    if os.path.exists(tmp_rdf_file):
                        with open(tmp_rdf_file, "r", encoding="utf-8") as fin:
                            fout.write(fin.read())
            print("Written..")

    def load_remote_datasets(self, dsnames):
        '''Loads remote datasets into cache if a remote endpoint is set

        Parameters
        ----------

        dsnames : array
            array of dataset names

        Raises
        ------

        Exception
            If no remote endpoint is set, or if no dataset names are given

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Fetch LiPD data from remote RDF Graph
            lipd_remote = LiPD()
            lipd_remote.set_endpoint("https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse2")
            lipd_remote.load_remote_datasets(["Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001", "MD98_2181.Stott.2007", "Ant-WAIS-Divide.Severinghaus.2012"])
            print(lipd_remote.get_all_dataset_names())
        '''
        if not self.endpoint:
            raise Exception("No remote endpoint")

        # Validate before wrapping: wrapping None first would yield [None]
        # and hide the "nothing to cache" case.
        if dsnames is None:
            raise Exception("No dataset names to cache")
        dsnames = self._as_list(dsnames)
        if len(dsnames) == 0:
            raise Exception("No dataset names to cache")

        dsnamestr = (' '.join('<' + NSURL + "/" + dsname + '>' for dsname in dsnames))
        print("Caching datasets from remote endpoint..")
        qres, qres_df = self.query(f"SELECT ?s ?p ?o ?g WHERE {{ GRAPH ?g {{ ?s ?p ?o }} VALUES ?g {{ {dsnamestr} }} }}", remote=True)

        # Reinitialize graph
        # self._initialize_graph()
        for row in qres:
            self.graph.add((row.s, row.p, row.o, row.g))
        print("Done..")

    def update_remote_datasets(self, dsnames):
        '''Updates local LiPD Graph for datasets to remote endpoint

        Currently only checks that a remote endpoint is set; the update
        itself is not implemented yet.
        '''
        if not self.endpoint:
            raise Exception("No remote endpoint")
        # TODO: Implement this

    # ------------------------------------------------------------------
    # Bibliography
    # ------------------------------------------------------------------

    def get_bibtex(self, remote=True, save=True, path='mybiblio.bib', verbose=False):
        '''Get BibTeX for loaded datasets

        Parameters
        ----------

        remote : bool
            (Optional) If set to True, will return the bibliography by checking against the DOI.
            If False (or if the DOI lookup fails), the entry is built from the publication
            information stored in the graph.

        save : bool
            (Optional) Whether to save the bibliography to a file

        path : str
            (Optional) Path where to save the file

        verbose : bool
            (Optional) Whether to print out on the console. Note that this option will turn on automatically if saving to a file fails.

        Returns
        -------

        bibs : list
            List of BibTeX entries

        df : pandas.DataFrame
            Bibliography information in a Pandas DataFrame

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Fetch LiPD data from remote RDF Graph
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
                "../examples/data/MD98_2181.Stott.2007.lpd"
            ])
            print(lipd.get_bibtex(save=False))
        '''

        def establish_type(pub_type):
            # Missing types (None, NaN, "") fall back to 'misc'
            if not isinstance(pub_type, str) or not pub_type:
                return 'misc'
            pub_type = pub_type.replace('-', '').lower()
            if 'article' in pub_type or 'shortcommunication' in pub_type:
                return 'article'
            if 'chapter' in pub_type or 'book' in pub_type:
                return 'chapter'
            if 'report' in pub_type:
                return 'report'
            return 'misc'

        def make_bib(row):
            # Normalise missing values first so every later test is a plain truthiness check
            row = row.fillna("")
            pub_type = establish_type(row['type'])

            # Create a unique citation ID if not given
            if row['citeKey']:
                citation_key = str(row['citeKey'])
            else:
                characters = string.ascii_letters + string.digits
                citation_key = ''.join(random.choice(characters) for _ in range(8))

            entries = []

            def add(field, value):
                if value:
                    entries.append((field, str(value)))

            # The order below mirrors the precedence of the fields; some
            # field names ('year', 'title', 'url') can be appended twice.
            add('author', row['authors'])
            add('doi', row['doi'])
            add('year', row['year'])
            add('year', row['pubyear'])
            if pub_type in ('article', 'misc'):
                add('title', row['title'])
            else:  # 'chapter' or 'report'
                add('chapter', row['title'])
            if pub_type == 'article':
                add('journal', row['journal'])
            elif pub_type == 'chapter':
                # establish_type maps books to 'chapter', so the container
                # title is stored for that type.
                add('title', row['journal'])
            add('volume', row['volume'])
            add('issue', row['issue'])
            add('pages', row['pages'])
            add('publisher', row['publisher'])
            add('title', row['report'])
            add('edition', row['edition'])
            add('institution', row['institution'])
            add('url', row['url'])
            add('url', row['url2'])

            entry_types = {'article': 'article', 'chapter': 'inbook', 'report': 'inbook', 'misc': 'misc'}
            return BibliographyData({citation_key: Entry(entry_types[pub_type], entries)})

        result, df = self.query(QUERY_BIBLIO)
        bibs = []

        for _, row in df.iterrows():
            bibtex = None
            if remote:
                try:
                    f = crossref.get_bib(row['doi'])
                except Exception:
                    print("Cannot parse the provided DOI, creating the entry manually")
                else:
                    if f[0]:
                        bibtex = f[1]
                    else:
                        print("Cannot find a matching record for the provided DOI, creating the entry manually")
            if bibtex is None:
                bibtex = make_bib(row).to_string('bibtex')
            bibs.append(bibtex)

        if save:
            try:
                with io.open(path, 'w', encoding="utf-8") as bibfile:
                    for bib in bibs:
                        bibfile.write("{}\n".format(bib))
            except (TypeError, OSError):
                print("Can't save in output file\n")
                verbose = True

        if verbose:
            print(bibs)

        return bibs, df

    # ------------------------------------------------------------------
    # Time series
    # ------------------------------------------------------------------

    def get_timeseries(self, dsnames, to_dataframe=False):
        '''Get Legacy LiPD like Time Series Object (tso)

        Parameters
        ----------

        dsnames : list
            array of dataset id or name strings

        to_dataframe : bool {True; False}
            Whether to return a dataframe along the dictionary. Default is False

        Returns
        -------

        ts : dict
            A dictionary containing Time Series Object

        df : Pandas.DataFrame
            If to_dataframe is set to True, returns a queriable Pandas DataFrame

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Fetch LiPD data from remote RDF Graph
            lipd_remote = LiPD()
            lipd_remote.set_endpoint("https://linkedearth.graphdb.mint.isi.edu/repositories/LiPDVerse2")
            ts_list = lipd_remote.get_timeseries(["Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001", "MD98_2181.Stott.2007", "Ant-WAIS-Divide.Severinghaus.2012"])
            for dsname, tsos in ts_list.items():
                for tso in tsos:
                    if 'paleoData_variableName' in tso:
                        print(dsname+': '+tso['paleoData_variableName']+': '+tso['archiveType'])
        '''
        if isinstance(dsnames, str):
            dsnames = [dsnames]

        ts = self._get_timeseries(dsnames)
        if not to_dataframe:
            return ts

        # Flatten {dataset: [tso, ...]} into one row per time series object
        dict_list = []
        for tsos in ts.values():
            dict_list.extend(tsos)
        df = pd.DataFrame.from_dict(dict_list, orient='columns')
        return ts, df

    def _get_timeseries(self, dsnames):
        '''Convert each dataset to LiPD json and extract its time series objects.

        Returns a dict keyed by dataset name; datasets whose json is empty are omitted.
        '''
        timeseries = {}
        for dsname in dsnames:
            converter = RDFToLiPD(self.graph)
            d = converter.convert_to_json(dsname)
            print("Extracting timeseries from dataset: " + dsname + " ...")
            if len(d.items()):
                timeseries[dsname] = LiPD_Legacy().extract(d)
        return timeseries

    def get_timeseries_essentials(self, dsname=None, mode='paleo'):
        ''' Returns specific properties for timeseries: 'dataSetName', 'archiveType', 'geo_meanLat', 'geo_meanLon',
        'geo_meanElev', 'paleoData_variableName', 'paleoData_values', 'paleoData_units',
        'paleoData_proxy' (paleo only), 'paleoData_proxyGeneral' (paleo only), 'time_variableName', 'time_values',
        'time_units', 'depth_variableName', 'depth_values', 'depth_units'

        Parameters
        ----------
        dsname : str, optional
            The name of the dataset for which to return the timeseries information. The default is None.
        mode : paleo, chron
            Whether to return the information stored in the PaleoMeasurementTable or the ChronMeasurementTable. The default is 'paleo'.

        Raises
        ------
        ValueError
            Need to select either 'chron' or 'paleo'

        Returns
        -------
        qres_df : pandas.DataFrame
            A pandas dataframe returning the properties in columns for each series stored in a row of the dataframe

        Example
        --------

        .. jupyter-execute::

            from pylipd.utils.dataset import load_datasets
            lipd = load_datasets('ODP846.Lawrence.2006.lpd')
            df_paleo = lipd.get_timeseries_essentials(mode='paleo')
            print(df_paleo)

        To return the information stored in the ChronTable:

        .. jupyter-execute::

            from pylipd.utils.dataset import load_datasets
            lipd = load_datasets('ODP846.Lawrence.2006.lpd')
            df_chron = lipd.get_timeseries_essentials(mode='chron')
            print(df_chron)
        '''
        if dsname is None:
            dsname = ''

        if mode == 'paleo':
            query = QUERY_TIMESERIES_ESSENTIALS_PALEO
        elif mode == 'chron':
            query = QUERY_TIMESERIES_ESSENTIALS_CHRON
        else:
            raise ValueError("The mode should be either 'paleo' or 'chron'")
        query = query.replace("[dsname]", dsname)

        qres, qres_df = self.query(query)

        # Convert whichever value columns the result carries from their
        # string form into numpy arrays (None entries are left as None).
        for column in ('paleoData_values', 'chronData_values', 'time_values', 'depth_values'):
            if column in qres_df.columns:
                qres_df[column] = qres_df[column].apply(self._parse_numeric_list)

        return qres_df

    # ------------------------------------------------------------------
    # LiPD json / dataset level access
    # ------------------------------------------------------------------

    def get_lipd(self, dsname):
        '''Get LiPD json for a dataset

        Parameters
        ----------

        dsname : str
            dataset id

        Returns
        -------

        lipdjson : dict
            LiPD json

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Load a local LiPD file
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            ])
            lipd_json = lipd.get_lipd(lipd.get_all_dataset_names()[0])
            print(lipd_json)
        '''
        converter = RDFToLiPD(self.graph)
        return converter.convert_to_json(dsname)

    def create_lipd(self, dsname, lipdfile):
        '''Create LiPD file for a dataset

        Parameters
        ----------

        dsname : str
            dataset id

        lipdfile: str
            path to LiPD file

        Returns
        -------

        lipdjson : dict
            LiPD json

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Load a local file
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
            ])
            dsname = lipd.get_all_dataset_names()[0]
            lipd.create_lipd(dsname, "test.lpd")
        '''
        converter = RDFToLiPD(self.graph)
        return converter.convert(dsname, lipdfile)

    def get(self, dsnames):
        '''Gets dataset(s) from the graph and returns them as a new LiPD object

        Parameters
        ----------

        dsnames : str or list of str
            dataset name(s) to get.

        Returns
        -------

        pylipd.lipd.LiPD
            LiPD object with the retrieved dataset(s)

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Load LiPD files from a local directory
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
                "../examples/data/MD98_2181.Stott.2007.lpd"
            ])

            all_datasets = lipd.get_all_dataset_names()
            print("Loaded datasets: " + str(all_datasets))
            ds = lipd.get(all_datasets[0])
            print("Got dataset: " + str(ds.get_all_dataset_names()))
        '''
        ds = super().get(self._to_dataset_ids(dsnames))
        return LiPD(ds.graph)

    def pop(self, dsnames):
        '''Pops dataset(s) from the graph and returns the popped LiPD object

        Parameters
        ----------

        dsnames : str or list of str
            dataset name(s) to be popped.

        Returns
        -------

        pylipd.lipd.LiPD
            LiPD object with the popped dataset(s)

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Load local files
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
                "../examples/data/MD98_2181.Stott.2007.lpd"
            ])
            all_datasets = lipd.get_all_dataset_names()
            print("Loaded datasets: " + str(all_datasets))
            popped = lipd.pop(all_datasets[0])
            print("Loaded datasets after pop: " + str(lipd.get_all_dataset_names()))
            print("Popped dataset: " + str(popped.get_all_dataset_names()))
        '''
        popped = super().pop(self._to_dataset_ids(dsnames))
        return LiPD(popped.graph)

    def remove(self, dsnames):
        '''Removes dataset(s) from the graph

        Parameters
        ----------

        dsnames : str or list of str
            dataset name(s) to be removed

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Load local files
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
                "../examples/data/MD98_2181.Stott.2007.lpd"
            ])
            all_datasets = lipd.get_all_dataset_names()
            print("Loaded datasets: " + str(all_datasets))
            lipd.remove(all_datasets[0])
            print("Loaded datasets after remove: " + str(lipd.get_all_dataset_names()))
        '''
        super().remove(self._to_dataset_ids(dsnames))

    # ------------------------------------------------------------------
    # Listing queries
    # ------------------------------------------------------------------

    def get_all_dataset_names(self):
        '''Get all Dataset Names

        Returns
        -------

        dsnames : list
            A list of dataset names

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Load local files
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
                "../examples/data/MD98_2181.Stott.2007.lpd"
            ])
            print(lipd.get_all_dataset_names())
        '''
        qres, qres_df = self.query(QUERY_DSNAME)
        return [sanitizeId(row.dsname) for row in qres]

    def get_all_dataset_ids(self):
        '''Get all Dataset ids

        Returns
        -------

        dsids : list
            A list of dataset ids

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Load local files
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
                "../examples/data/MD98_2181.Stott.2007.lpd"
            ])
            print(lipd.get_all_dataset_ids())
        '''
        qres, qres_df = self.query(QUERY_DSID)
        return [sanitizeId(row.dsid) for row in qres]

    def get_all_archiveTypes(self):
        '''
        Returns a list of all the unique archiveTypes present in the LiPD object

        Returns
        -------
        list
            A list of archiveTypes

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            # Load Local files
            lipd = LiPD()
            lipd.load([
                "../examples/data/Ocn-MadangLagoonPapuaNewGuinea.Kuhnert.2001.lpd",
                "../examples/data/MD98_2181.Stott.2007.lpd"
            ])
            print(lipd.get_all_archiveTypes())
        '''
        qres, qres_df = self.query(QUERY_UNIQUE_ARCHIVE_TYPE)
        return [str(row.archiveType) for row in qres]

    def get_all_locations(self, dsname=None):
        '''Return geographical coordinates for all the datasets.

        Parameters
        ----------
        dsname : str, optional
            The name of the dataset for which to return the location information. The default is None.

        Returns
        -------
        df : pandas.DataFrame
            A pandas dataframe returning the latitude, longitude and elevation for each dataset

        Examples
        --------

        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir('Pages2k')
            df = lipd.get_all_locations()
            print(df)
        '''
        if dsname is None:
            dsname = ''
        query = QUERY_LOCATION.replace("[dsname]", dsname)
        return self.query(query)[1]

    def get_ensemble_tables(self, dsname=None, ensembleVarName=None, ensembleDepthVarName='depth'):
        '''Gets ensemble tables from the LiPD graph

        Parameters
        ----------

        dsname : str
            The name of the dataset if you wish to analyse one at a time (Set to ".*" to match all datasets with a common root)

        ensembleVarName : None or str
            ensemble variable name. Default is None, which searches for names that contain "year" or "age" (Set to ".*" to match all ensemble variable names)

        ensembleDepthVarName : str
            ensemble depth variable name. Default is 'depth' (Set to ".*" to match all ensemble depth variable names)

        Returns
        -------

        ensemble_tables : dataframe
            A dataframe containing the ensemble tables

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            lipd = LiPD()
            lipd.load([
                "../examples/data/ODP846.Lawrence.2006.lpd"
            ])
            all_datasets = lipd.get_all_dataset_names()
            print("Loaded datasets: " + str(all_datasets))

            ens_df = lipd.get_ensemble_tables(
                ensembleVarName="age",
                ensembleDepthVarName="depth"
            )
            print(ens_df)
        '''
        if dsname is None:
            dsname = ''

        if ensembleVarName is None:
            query = QUERY_ENSEMBLE_TABLE_SHORT
        else:
            query = QUERY_ENSEMBLE_TABLE.replace("[ensembleVarName]", ensembleVarName)
        query = query.replace("[dsname]", dsname)
        query = query.replace("[ensembleDepthVarName]", ensembleDepthVarName)

        qres, qres_df = self.query(query)

        qres_df['ensembleDepthValues'] = qres_df['ensembleDepthValues'].apply(self._parse_numeric_list)
        # The ensemble values are a nested list literal, hence literal_eval rather than fromstring
        qres_df['ensembleVariableValues'] = qres_df['ensembleVariableValues'].apply(lambda row: np.array(ast.literal_eval(row)))

        return qres_df

    def get_all_variables(self):
        '''
        Returns a list of all variables in the graph

        Returns
        -------

        pandas.DataFrame
            A dataframe of all variables in the graph with columns uri, varid, varname

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            lipd = LiPD()
            lipd.load([
                "../examples/data/ODP846.Lawrence.2006.lpd"
            ])

            df = lipd.get_all_variables()
            print(df)
        '''
        return self.query(QUERY_VARIABLE)[1]

    def get_all_variable_names(self):
        """
        Get a list of all possible distinct variableNames. Useful for filtering and querying.

        Returns
        -------
        list
            A list of unique variableName

        Examples
        --------

        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir('Pages2k')
            varName = lipd.get_all_variable_names()
            print(varName)
        """
        return self.query(QUERY_DISTINCT_VARIABLE)[1].iloc[:, 0].values.tolist()

    def get_dataset_properties(self):
        """Get a list of unique properties attached to a dataset.

        Note: Some properties will return another object (e.g., 'publishedIn' will give you a Publication object with its own properties)
        Note: Not all datasets will have the same available properties (i.e., not filled in by a user)

        Returns
        -------
        clean_list : list
            A list of available properties that can be queried

        Examples
        --------

        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir(name='Pages2k')
            dataset_properties = lipd.get_dataset_properties()
            print(dataset_properties)
        """
        return self._query_property_names(QUERY_DATASET_PROPERTIES)

    def get_variable_properties(self):
        '''Get a list of variable properties that can be used for querying

        Returns
        -------
        list
            A list of unique variable properties

        Examples
        --------

        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir(name='Pages2k')
            variable_properties = lipd.get_variable_properties()
            print(variable_properties)
        '''
        return self._query_property_names(QUERY_VARIABLE_PROPERTIES)

    def get_model_properties(self):
        '''Get all the properties associated with a model

        Returns
        -------
        List
            A list of unique properties attached to models

        Examples
        --------

        .. jupyter-execute::

            from pylipd.utils.dataset import load_datasets

            lipd = load_datasets(names='ODP846')
            model_properties = lipd.get_model_properties()
            print(model_properties)
        '''
        return self._query_property_names(QUERY_MODEL_PROPERTIES)

    def to_lipd_series(self, parallel=False):
        '''
        Converts the LiPD object to a LiPDSeries object

        Parameters
        ----------

        parallel : bool
            Whether to use parallel processing to load the data. Default is False

        Returns
        -------

        pylipd.lipd.LiPDSeries
            A LiPDSeries object

        Examples
        --------

        .. jupyter-execute::

            from pylipd.lipd import LiPD

            lipd = LiPD()
            lipd.load([
                "../examples/data/ODP846.Lawrence.2006.lpd"
            ])

            S = lipd.to_lipd_series()
        '''
        S = LiPDSeries()
        S.load(self, parallel)
        return S

    # ------------------------------------------------------------------
    # Filters
    # ------------------------------------------------------------------

    # bbox = left,bottom,right,top
    # bbox = min Longitude , min Latitude , max Longitude , max Latitude
    def filter_by_geo_bbox(self, lonMin, latMin, lonMax, latMax):
        '''
        Filters datasets to return a new LiPD object that only keeps datasets that fall within the bounding box

        Parameters
        ----------

        lonMin : float
            Minimum longitude

        latMin : float
            Minimum latitude

        lonMax : float
            Maximum longitude

        latMax : float
            Maximum latitude

        Returns
        -------

        pylipd.lipd.LiPD
            A new LiPD object that only contains datasets that fall within the bounding box

        Examples
        --------
        pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir()
            Lfiltered = lipd.filter_by_geo_bbox(0,25,50,50)
            Lfiltered.get_all_dataset_names()
        '''
        query = QUERY_FILTER_GEO
        for placeholder, value in (
            ("[lonMin]", lonMin),
            ("[latMin]", latMin),
            ("[lonMax]", lonMax),
            ("[latMax]", latMax),
        ):
            query = query.replace(placeholder, str(value))

        qres, qres_df = self.query(query)
        dsnames = [sanitizeId(row.dsname) for row in qres]
        return self.get(dsnames)

    def filter_by_archive_type(self, archiveType):
        '''
        Filters datasets to return a new LiPD object that only keeps datasets that have the specified archive type

        Parameters
        ----------

        archiveType : str
            The archive type to filter by

        Returns
        -------

        pylipd.lipd.LiPD
            A new LiPD object that only contains datasets that have the specified archive type (regex)

        Examples
        --------
        pyLipd ships with existing datasets that can be loaded directly through the package. Let's load the Pages2k sample datasets using this method.

        .. jupyter-execute::

            from pylipd.utils.dataset import load_dir

            lipd = load_dir('Pages2k')
            Lfiltered = lipd.filter_by_archive_type('marine')
            Lfiltered.get_all_dataset_names()
        '''
        query = QUERY_FILTER_ARCHIVE_TYPE.replace("[archiveType]", archiveType)

        qres, qres_df = self.query(query)
        dsnames = [sanitizeId(row.dsname) for row in qres]
        return self.get(dsnames)