""" Query a SPARQL endpoint and return results as a Pandas dataframe. """ import io from typing import TYPE_CHECKING, Any, Dict, List, Union from SPARQLWrapper.SmartWrapper import Bindings, SPARQLWrapper2, Value from SPARQLWrapper.Wrapper import CSV, SELECT, SPARQLWrapper if TYPE_CHECKING: import pandas as pd class QueryException(Exception): pass def get_sparql_dataframe_orig( endpoint: str, query: Union[str, bytes] ) -> "pd.DataFrame": """copy paste from: https://github.com/lawlesst/sparql-dataframe""" # pandas inside to avoid requiring it import pandas as pd sparql = SPARQLWrapper(endpoint) sparql.setQuery(query) if sparql.queryType != SELECT: raise QueryException("Only SPARQL SELECT queries are supported.") sparql.setReturnFormat(CSV) results = sparql.query().convert() if isinstance(results, bytes): _csv = io.StringIO(results.decode("utf-8")) return pd.read_csv(_csv, sep=",") else: raise TypeError(type(results)) def get_sparql_typed_dict( endpoint: str, query: Union[str, bytes] ) -> List[Dict[str, Value]]: """modified from: https://github.com/lawlesst/sparql-dataframe""" # pandas inside to avoid requiring it import pandas as pd # rdflib in here because there is some meta stuff in the setup.py and Travis fails because rdflib is installed later import rdflib.term sparql = SPARQLWrapper2(endpoint) sparql.setQuery(query) if sparql.queryType != SELECT: raise QueryException("Only SPARQL SELECT queries are supported.") # sparql.setReturnFormat(JSON) results = sparql.query() if not isinstance(results, Bindings): raise TypeError(type(results)) # consider perf hacking later, probably slow # convert list of dicts to python types d = [] for x in results.bindings: row = {} for k in x: v = x[k] vv = rdflib.term.Literal(v.value, datatype=v.datatype).toPython() # type: ignore[no-untyped-call] row[k] = vv d.append(row) return d def get_sparql_dataframe(endpoint: str, query: Union[str, bytes]) -> "pd.DataFrame": # pandas inside to avoid requiring it import pandas as pd d = get_sparql_typed_dict(endpoint, query) # TODO: will nan fill somehow, make more strict if there is way of getting the nan types from rdflib df = pd.DataFrame(d) return df