804 lines
27 KiB
Python
804 lines
27 KiB
Python
"""
|
||
Parser plugin interface.
|
||
|
||
This module defines the parser plugin interface and contains other
|
||
related parser support code.
|
||
|
||
The module is mainly useful for those wanting to write a parser that
|
||
can plugin to rdflib. If you are wanting to invoke a parser you likely
|
||
want to do so through the Graph class parse method.
|
||
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import codecs
|
||
import os
|
||
import pathlib
|
||
import sys
|
||
from io import BufferedIOBase, BytesIO, RawIOBase, StringIO, TextIOBase, TextIOWrapper
|
||
from typing import (
|
||
IO,
|
||
TYPE_CHECKING,
|
||
Any,
|
||
BinaryIO,
|
||
List,
|
||
Optional,
|
||
TextIO,
|
||
Tuple,
|
||
Union,
|
||
cast,
|
||
)
|
||
from urllib.parse import urljoin
|
||
from urllib.request import Request, url2pathname
|
||
from xml.sax import xmlreader
|
||
|
||
import rdflib.util
|
||
from rdflib import __version__
|
||
from rdflib._networking import _urlopen
|
||
from rdflib.namespace import Namespace
|
||
from rdflib.term import URIRef
|
||
|
||
if TYPE_CHECKING:
|
||
from email.message import Message
|
||
from io import BufferedReader
|
||
from urllib.response import addinfourl
|
||
|
||
from typing_extensions import Buffer
|
||
|
||
from rdflib.graph import Graph
|
||
|
||
|
||
__all__ = [
|
||
"Parser",
|
||
"InputSource",
|
||
"StringInputSource",
|
||
"URLInputSource",
|
||
"FileInputSource",
|
||
"PythonInputSource",
|
||
]
|
||
|
||
|
||
class Parser:
|
||
__slots__ = ()
|
||
|
||
def __init__(self):
|
||
pass
|
||
|
||
def parse(self, source: InputSource, sink: Graph) -> None:
|
||
pass
|
||
|
||
|
||
class BytesIOWrapper(BufferedIOBase):
|
||
__slots__ = (
|
||
"wrapped",
|
||
"enc_str",
|
||
"text_str",
|
||
"encoding",
|
||
"encoder",
|
||
"has_read1",
|
||
"has_seek",
|
||
"_name",
|
||
"_fileno",
|
||
"_isatty",
|
||
"_leftover",
|
||
"_bytes_per_char",
|
||
"_text_bytes_offset",
|
||
)
|
||
|
||
def __init__(self, wrapped: Union[str, StringIO, TextIOBase], encoding="utf-8"):
|
||
super(BytesIOWrapper, self).__init__()
|
||
self.wrapped = wrapped
|
||
self.encoding = encoding
|
||
self.encoder = codecs.getencoder(self.encoding)
|
||
self.enc_str: Optional[Union[BytesIO, BufferedIOBase]] = None
|
||
self.text_str: Optional[Union[StringIO, TextIOBase]] = None
|
||
self.has_read1: Optional[bool] = None
|
||
self.has_seek: Optional[bool] = None
|
||
self._name: Optional[str] = None
|
||
self._fileno: Optional[Union[int, BaseException]] = None
|
||
self._isatty: Optional[Union[bool, BaseException]] = None
|
||
self._leftover: bytes = b""
|
||
self._text_bytes_offset: int = 0
|
||
norm_encoding = encoding.lower().replace("_", "-")
|
||
if norm_encoding in ("utf-8", "utf8", "u8", "cp65001"):
|
||
# utf-8 has a variable number of bytes per character, 1-4
|
||
self._bytes_per_char: int = 1 # assume average of 1 byte per character
|
||
elif norm_encoding in (
|
||
"latin1",
|
||
"latin-1",
|
||
"iso-8859-1",
|
||
"iso8859-1",
|
||
"ascii",
|
||
"us-ascii",
|
||
):
|
||
# these are all 1-byte-per-character encodings
|
||
self._bytes_per_char = 1
|
||
elif norm_encoding.startswith("utf-16") or norm_encoding.startswith("utf16"):
|
||
# utf-16 has a variable number of bytes per character, 2-3
|
||
self._bytes_per_char = 2 # assume average of 2 bytes per character
|
||
elif norm_encoding.startswith("utf-32") or norm_encoding.startswith("utf32"):
|
||
# utf-32 is fixed length with 4 bytes per character
|
||
self._bytes_per_char = 4
|
||
else:
|
||
# not sure, just assume it is 2 bytes per character
|
||
self._bytes_per_char = 2
|
||
|
||
def _init(self):
|
||
name: Optional[str] = None
|
||
if isinstance(self.wrapped, str):
|
||
b, blen = self.encoder(self.wrapped)
|
||
self.enc_str = BytesIO(b)
|
||
name = "string"
|
||
elif isinstance(self.wrapped, TextIOWrapper):
|
||
inner = self.wrapped.buffer
|
||
# type error: TextIOWrapper.buffer cannot be a BytesIOWrapper
|
||
if isinstance(inner, BytesIOWrapper): # type: ignore[unreachable]
|
||
raise Exception(
|
||
"BytesIOWrapper cannot be wrapped in TextIOWrapper, "
|
||
"then wrapped in another BytesIOWrapper"
|
||
)
|
||
else:
|
||
self.enc_str = cast(BufferedIOBase, inner)
|
||
elif isinstance(self.wrapped, (TextIOBase, StringIO)):
|
||
self.text_str = self.wrapped
|
||
use_stream: Union[BytesIO, StringIO, BufferedIOBase, TextIOBase]
|
||
if self.enc_str is not None:
|
||
use_stream = self.enc_str
|
||
elif self.text_str is not None:
|
||
use_stream = self.text_str
|
||
else:
|
||
raise Exception("No stream to read from")
|
||
if name is None:
|
||
try:
|
||
name = use_stream.name # type: ignore[union-attr]
|
||
except AttributeError:
|
||
name = "stream"
|
||
self.has_read1 = hasattr(use_stream, "read1")
|
||
try:
|
||
self.has_seek = use_stream.seekable()
|
||
except AttributeError:
|
||
self.has_seek = hasattr(use_stream, "seek")
|
||
|
||
self._name = name
|
||
|
||
def _check_fileno(self):
|
||
use_stream: Union[BytesIO, StringIO, BufferedIOBase, TextIOBase]
|
||
if self.enc_str is None and self.text_str is None:
|
||
self._init()
|
||
if self.enc_str is not None:
|
||
use_stream = self.enc_str
|
||
elif self.text_str is not None:
|
||
use_stream = self.text_str
|
||
try:
|
||
self._fileno = use_stream.fileno()
|
||
except OSError as e:
|
||
self._fileno = e
|
||
except AttributeError:
|
||
self._fileno = -1
|
||
|
||
def _check_isatty(self):
|
||
use_stream: Union[BytesIO, StringIO, BufferedIOBase, TextIOBase]
|
||
if self.enc_str is None and self.text_str is None:
|
||
self._init()
|
||
if self.enc_str is not None:
|
||
use_stream = self.enc_str
|
||
elif self.text_str is not None:
|
||
use_stream = self.text_str
|
||
try:
|
||
self._isatty = use_stream.isatty()
|
||
except OSError as e:
|
||
self._isatty = e
|
||
except AttributeError:
|
||
self._isatty = False
|
||
|
||
@property
|
||
def name(self) -> Any:
|
||
if self._name is None:
|
||
self._init()
|
||
return self._name
|
||
|
||
@property
|
||
def closed(self) -> bool:
|
||
if self.enc_str is None and self.text_str is None:
|
||
return False
|
||
closed: Optional[bool] = None
|
||
if self.enc_str is not None:
|
||
try:
|
||
closed = self.enc_str.closed
|
||
except AttributeError:
|
||
closed = None
|
||
elif self.text_str is not None:
|
||
try:
|
||
closed = self.text_str.closed
|
||
except AttributeError:
|
||
closed = None
|
||
return False if closed is None else closed
|
||
|
||
def readable(self) -> bool:
|
||
return True
|
||
|
||
def writable(self) -> bool:
|
||
return False
|
||
|
||
def truncate(self, size: Optional[int] = None) -> int:
|
||
raise NotImplementedError("Cannot truncate on BytesIOWrapper")
|
||
|
||
def isatty(self) -> bool:
|
||
if self._isatty is None:
|
||
self._check_isatty()
|
||
if isinstance(self._isatty, BaseException):
|
||
raise self._isatty
|
||
else:
|
||
return bool(self._isatty)
|
||
|
||
def fileno(self) -> int:
|
||
if self._fileno is None:
|
||
self._check_fileno()
|
||
if isinstance(self._fileno, BaseException):
|
||
raise self._fileno
|
||
else:
|
||
return -1 if self._fileno is None else self._fileno
|
||
|
||
def close(self):
|
||
if self.enc_str is None and self.text_str is None:
|
||
return
|
||
if self.enc_str is not None:
|
||
try:
|
||
self.enc_str.close()
|
||
except AttributeError:
|
||
pass
|
||
elif self.text_str is not None:
|
||
try:
|
||
self.text_str.close()
|
||
except AttributeError:
|
||
pass
|
||
|
||
def flush(self):
|
||
return # Does nothing on read-only streams
|
||
|
||
def _read_bytes_from_text_stream(self, size: Optional[int] = -1, /) -> bytes:
|
||
if TYPE_CHECKING:
|
||
assert self.text_str is not None
|
||
if size is None or size < 0:
|
||
try:
|
||
ret_str: str = self.text_str.read()
|
||
except EOFError:
|
||
ret_str = ""
|
||
ret_encoded, enc_len = self.encoder(ret_str)
|
||
if self._leftover:
|
||
ret_bytes = self._leftover + ret_encoded
|
||
self._leftover = b""
|
||
else:
|
||
ret_bytes = ret_encoded
|
||
elif size == len(self._leftover):
|
||
ret_bytes = self._leftover
|
||
self._leftover = b""
|
||
elif size < len(self._leftover):
|
||
ret_bytes = self._leftover[:size]
|
||
self._leftover = self._leftover[size:]
|
||
else:
|
||
d, m = divmod(size, self._bytes_per_char)
|
||
get_per_loop = int(d) + (1 if m > 0 else 0)
|
||
got_bytes: bytes = self._leftover
|
||
while len(got_bytes) < size:
|
||
try:
|
||
got_str: str = self.text_str.read(get_per_loop)
|
||
except EOFError:
|
||
got_str = ""
|
||
if len(got_str) < 1:
|
||
break
|
||
ret_encoded, enc_len = self.encoder(got_str)
|
||
got_bytes += ret_encoded
|
||
if len(got_bytes) == size:
|
||
self._leftover = b""
|
||
ret_bytes = got_bytes
|
||
else:
|
||
ret_bytes = got_bytes[:size]
|
||
self._leftover = got_bytes[size:]
|
||
del got_bytes
|
||
self._text_bytes_offset += len(ret_bytes)
|
||
return ret_bytes
|
||
|
||
def read(self, size: Optional[int] = -1, /) -> bytes:
|
||
"""
|
||
Read at most size bytes, returned as a bytes object.
|
||
|
||
If the size argument is negative or omitted read until EOF is reached.
|
||
Return an empty bytes object if already at EOF.
|
||
"""
|
||
if size is not None and size == 0:
|
||
return b""
|
||
if self.enc_str is None and self.text_str is None:
|
||
self._init()
|
||
if self.enc_str is not None:
|
||
ret_bytes = self.enc_str.read(size)
|
||
else:
|
||
ret_bytes = self._read_bytes_from_text_stream(size)
|
||
return ret_bytes
|
||
|
||
def read1(self, size: Optional[int] = -1, /) -> bytes:
|
||
"""
|
||
Read at most size bytes, with at most one call to the underlying raw stream’s
|
||
read() or readinto() method. Returned as a bytes object.
|
||
|
||
If the size argument is negative or omitted, read until EOF is reached.
|
||
Return an empty bytes object at EOF.
|
||
"""
|
||
if (self.enc_str is None and self.text_str is None) or self.has_read1 is None:
|
||
self._init()
|
||
if not self.has_read1:
|
||
raise NotImplementedError()
|
||
if self.enc_str is not None:
|
||
if size is None or size < 0:
|
||
return self.enc_str.read1()
|
||
return self.enc_str.read1(size)
|
||
raise NotImplementedError("read1() not supported for TextIO in BytesIOWrapper")
|
||
|
||
def readinto(self, b: Buffer, /) -> int:
|
||
"""
|
||
Read len(b) bytes into buffer b.
|
||
|
||
Returns number of bytes read (0 for EOF), or error if the object
|
||
is set not to block and has no data to read.
|
||
"""
|
||
if TYPE_CHECKING:
|
||
assert isinstance(b, (memoryview, bytearray))
|
||
if len(b) == 0:
|
||
return 0
|
||
if self.enc_str is None and self.text_str is None:
|
||
self._init()
|
||
if self.enc_str is not None:
|
||
return self.enc_str.readinto(b)
|
||
else:
|
||
size = len(b)
|
||
read_data: bytes = self._read_bytes_from_text_stream(size)
|
||
read_len = len(read_data)
|
||
if read_len == 0:
|
||
return 0
|
||
b[:read_len] = read_data
|
||
return read_len
|
||
|
||
def readinto1(self, b: Buffer, /) -> int:
|
||
"""
|
||
Read len(b) bytes into buffer b, with at most one call to the underlying raw
|
||
stream's read() or readinto() method.
|
||
|
||
Returns number of bytes read (0 for EOF), or error if the object
|
||
is set not to block and has no data to read.
|
||
"""
|
||
if TYPE_CHECKING:
|
||
assert isinstance(b, (memoryview, bytearray))
|
||
if (self.enc_str is None and self.text_str is None) or self.has_read1 is None:
|
||
self._init()
|
||
if not self.has_read1:
|
||
raise NotImplementedError()
|
||
if self.enc_str is not None:
|
||
return self.enc_str.readinto1(b)
|
||
raise NotImplementedError(
|
||
"readinto1() not supported for TextIO in BytesIOWrapper"
|
||
)
|
||
|
||
def seek(self, offset: int, whence: int = 0, /) -> int:
|
||
if self.has_seek is not None and not self.has_seek:
|
||
raise NotImplementedError()
|
||
if (self.enc_str is None and self.text_str is None) or self.has_seek is None:
|
||
self._init()
|
||
|
||
if not whence == 0:
|
||
raise NotImplementedError("Only SEEK_SET is supported on BytesIOWrapper")
|
||
if offset != 0:
|
||
raise NotImplementedError(
|
||
"Only seeking to zero is supported on BytesIOWrapper"
|
||
)
|
||
if self.enc_str is not None:
|
||
self.enc_str.seek(offset, whence)
|
||
elif self.text_str is not None:
|
||
self.text_str.seek(offset, whence)
|
||
self._text_bytes_offset = 0
|
||
self._leftover = b""
|
||
return 0
|
||
|
||
def seekable(self):
|
||
if (self.enc_str is None and self.text_str is None) or self.has_seek is None:
|
||
self._init()
|
||
return self.has_seek
|
||
|
||
def tell(self) -> int:
|
||
if self.has_seek is not None and not self.has_seek:
|
||
raise NotImplementedError("Cannot tell() pos because file is not seekable.")
|
||
if self.enc_str is not None:
|
||
try:
|
||
self._text_bytes_offset = self.enc_str.tell()
|
||
except AttributeError:
|
||
pass
|
||
return self._text_bytes_offset
|
||
|
||
def write(self, b, /):
|
||
raise NotImplementedError("Cannot write to a BytesIOWrapper")
|
||
|
||
|
||
class InputSource(xmlreader.InputSource):
|
||
"""
|
||
TODO:
|
||
"""
|
||
|
||
def __init__(self, system_id: Optional[str] = None):
|
||
xmlreader.InputSource.__init__(self, system_id=system_id)
|
||
self.content_type: Optional[str] = None
|
||
self.auto_close = False # see Graph.parse(), true if opened by us
|
||
|
||
def close(self) -> None:
|
||
c = self.getCharacterStream()
|
||
if c and hasattr(c, "close"):
|
||
try:
|
||
c.close()
|
||
except Exception:
|
||
pass
|
||
f = self.getByteStream()
|
||
if f and hasattr(f, "close"):
|
||
try:
|
||
f.close()
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
class PythonInputSource(InputSource):
|
||
"""
|
||
Constructs an RDFLib Parser InputSource from a Python data structure,
|
||
for example, loaded from JSON with json.load or json.loads:
|
||
|
||
>>> import json
|
||
>>> as_string = \"\"\"{
|
||
... "@context" : {"ex" : "http://example.com/ns#"},
|
||
... "@graph": [{"@type": "ex:item", "@id": "#example"}]
|
||
... }\"\"\"
|
||
>>> as_python = json.loads(as_string)
|
||
>>> source = create_input_source(data=as_python)
|
||
>>> isinstance(source, PythonInputSource)
|
||
True
|
||
"""
|
||
|
||
def __init__(self, data: Any, system_id: Optional[str] = None):
|
||
self.content_type = None
|
||
self.auto_close = False # see Graph.parse(), true if opened by us
|
||
self.public_id: Optional[str] = None
|
||
self.system_id: Optional[str] = system_id
|
||
self.data = data
|
||
|
||
def getPublicId(self) -> Optional[str]: # noqa: N802
|
||
return self.public_id
|
||
|
||
def setPublicId(self, public_id: Optional[str]) -> None: # noqa: N802
|
||
self.public_id = public_id
|
||
|
||
def getSystemId(self) -> Optional[str]: # noqa: N802
|
||
return self.system_id
|
||
|
||
def setSystemId(self, system_id: Optional[str]) -> None: # noqa: N802
|
||
self.system_id = system_id
|
||
|
||
def close(self) -> None:
|
||
self.data = None
|
||
|
||
|
||
class StringInputSource(InputSource):
|
||
"""
|
||
Constructs an RDFLib Parser InputSource from a Python String or Bytes
|
||
"""
|
||
|
||
def __init__(
|
||
self,
|
||
value: Union[str, bytes],
|
||
encoding: str = "utf-8",
|
||
system_id: Optional[str] = None,
|
||
):
|
||
super(StringInputSource, self).__init__(system_id)
|
||
stream: Union[BinaryIO, TextIO]
|
||
if isinstance(value, str):
|
||
stream = StringIO(value)
|
||
self.setCharacterStream(stream)
|
||
self.setEncoding(encoding)
|
||
b_stream = BytesIOWrapper(value, encoding)
|
||
self.setByteStream(b_stream)
|
||
else:
|
||
stream = BytesIO(value)
|
||
self.setByteStream(stream)
|
||
c_stream = TextIOWrapper(stream, encoding)
|
||
self.setCharacterStream(c_stream)
|
||
self.setEncoding(c_stream.encoding)
|
||
|
||
|
||
headers = {
|
||
"User-agent": "rdflib-%s (https://rdflib.github.io/; eikeon@eikeon.com)"
|
||
% __version__
|
||
}
|
||
|
||
|
||
class URLInputSource(InputSource):
|
||
"""
|
||
Constructs an RDFLib Parser InputSource from a URL to read it from the Web.
|
||
"""
|
||
|
||
links: List[str]
|
||
|
||
@classmethod
|
||
def getallmatchingheaders(cls, message: Message, name) -> List[str]:
|
||
# This is reimplemented here, because the method
|
||
# getallmatchingheaders from HTTPMessage is broken since Python 3.0
|
||
name = name.lower()
|
||
return [val for key, val in message.items() if key.lower() == name]
|
||
|
||
@classmethod
|
||
def get_links(cls, response: addinfourl) -> List[str]:
|
||
linkslines = cls.getallmatchingheaders(response.headers, "Link")
|
||
retarray: List[str] = []
|
||
for linksline in linkslines:
|
||
links = [linkstr.strip() for linkstr in linksline.split(",")]
|
||
for link in links:
|
||
retarray.append(link)
|
||
return retarray
|
||
|
||
def get_alternates(self, type_: Optional[str] = None) -> List[str]:
|
||
typestr: Optional[str] = f'type="{type_}"' if type_ else None
|
||
relstr = 'rel="alternate"'
|
||
alts = []
|
||
for link in self.links:
|
||
parts = [p.strip() for p in link.split(";")]
|
||
if relstr not in parts:
|
||
continue
|
||
if typestr:
|
||
if typestr in parts:
|
||
alts.append(parts[0].strip("<>"))
|
||
else:
|
||
alts.append(parts[0].strip("<>"))
|
||
return alts
|
||
|
||
def __init__(self, system_id: Optional[str] = None, format: Optional[str] = None):
|
||
super(URLInputSource, self).__init__(system_id)
|
||
self.url = system_id
|
||
|
||
# copy headers to change
|
||
myheaders = dict(headers)
|
||
if format == "xml":
|
||
myheaders["Accept"] = "application/rdf+xml, */*;q=0.1"
|
||
elif format == "n3":
|
||
myheaders["Accept"] = "text/n3, */*;q=0.1"
|
||
elif format in ["turtle", "ttl"]:
|
||
myheaders["Accept"] = "text/turtle, application/x-turtle, */*;q=0.1"
|
||
elif format == "nt":
|
||
myheaders["Accept"] = "text/plain, */*;q=0.1"
|
||
elif format == "trig":
|
||
myheaders["Accept"] = "application/trig, */*;q=0.1"
|
||
elif format == "trix":
|
||
myheaders["Accept"] = "application/trix, */*;q=0.1"
|
||
elif format == "json-ld":
|
||
myheaders["Accept"] = (
|
||
"application/ld+json, application/json;q=0.9, */*;q=0.1"
|
||
)
|
||
else:
|
||
# if format not given, create an Accept header from all registered
|
||
# parser Media Types
|
||
from rdflib.parser import Parser
|
||
from rdflib.plugin import plugins
|
||
|
||
acc = []
|
||
for p in plugins(kind=Parser): # only get parsers
|
||
if "/" in p.name: # all Media Types known have a / in them
|
||
acc.append(p.name)
|
||
|
||
myheaders["Accept"] = ", ".join(acc)
|
||
|
||
req = Request(system_id, None, myheaders) # type: ignore[arg-type]
|
||
|
||
response: addinfourl = _urlopen(req)
|
||
self.url = response.geturl() # in case redirections took place
|
||
self.links = self.get_links(response)
|
||
if format in ("json-ld", "application/ld+json"):
|
||
alts = self.get_alternates(type_="application/ld+json")
|
||
for link in alts:
|
||
full_link = urljoin(self.url, link)
|
||
if full_link != self.url and full_link != system_id:
|
||
response = _urlopen(Request(full_link))
|
||
self.url = response.geturl() # in case redirections took place
|
||
break
|
||
|
||
self.setPublicId(self.url)
|
||
content_types = self.getallmatchingheaders(response.headers, "content-type")
|
||
self.content_type = content_types[0] if content_types else None
|
||
if self.content_type is not None:
|
||
self.content_type = self.content_type.split(";", 1)[0]
|
||
self.setByteStream(response)
|
||
# TODO: self.setEncoding(encoding)
|
||
self.response_info = response.info() # a mimetools.Message instance
|
||
|
||
def __repr__(self) -> str:
|
||
# type error: Incompatible return value type (got "Optional[str]", expected "str")
|
||
return self.url # type: ignore[return-value]
|
||
|
||
|
||
class FileInputSource(InputSource):
|
||
def __init__(
|
||
self,
|
||
file: Union[BinaryIO, TextIO, TextIOBase, RawIOBase, BufferedIOBase],
|
||
/,
|
||
encoding: Optional[str] = None,
|
||
):
|
||
base = pathlib.Path.cwd().as_uri()
|
||
system_id = URIRef(pathlib.Path(file.name).absolute().as_uri(), base=base) # type: ignore[union-attr]
|
||
super(FileInputSource, self).__init__(system_id)
|
||
self.file = file
|
||
if isinstance(file, TextIOBase): # Python3 unicode fp
|
||
self.setCharacterStream(file)
|
||
self.setEncoding(file.encoding)
|
||
try:
|
||
b = file.buffer # type: ignore[attr-defined]
|
||
self.setByteStream(b)
|
||
except (AttributeError, LookupError):
|
||
self.setByteStream(BytesIOWrapper(file, encoding=file.encoding))
|
||
else:
|
||
if TYPE_CHECKING:
|
||
assert isinstance(file, BufferedReader)
|
||
self.setByteStream(file)
|
||
if encoding is not None:
|
||
self.setEncoding(encoding)
|
||
self.setCharacterStream(TextIOWrapper(file, encoding=encoding))
|
||
else:
|
||
# We cannot set characterStream here because
|
||
# we do not know the Raw Bytes File encoding.
|
||
pass
|
||
|
||
def __repr__(self) -> str:
|
||
return repr(self.file)
|
||
|
||
|
||
def create_input_source(
|
||
source: Optional[
|
||
Union[IO[bytes], TextIO, InputSource, str, bytes, pathlib.PurePath]
|
||
] = None,
|
||
publicID: Optional[str] = None, # noqa: N803
|
||
location: Optional[str] = None,
|
||
file: Optional[Union[BinaryIO, TextIO]] = None,
|
||
data: Optional[Union[str, bytes, dict]] = None,
|
||
format: Optional[str] = None,
|
||
) -> InputSource:
|
||
"""
|
||
Return an appropriate InputSource instance for the given
|
||
parameters.
|
||
"""
|
||
|
||
# test that exactly one of source, location, file, and data is not None.
|
||
non_empty_arguments = list(
|
||
filter(
|
||
lambda v: v is not None,
|
||
[source, location, file, data],
|
||
)
|
||
)
|
||
|
||
if len(non_empty_arguments) != 1:
|
||
raise ValueError(
|
||
"exactly one of source, location, file or data must be given",
|
||
)
|
||
|
||
input_source = None
|
||
|
||
if source is not None:
|
||
if TYPE_CHECKING:
|
||
assert file is None
|
||
assert data is None
|
||
assert location is None
|
||
if isinstance(source, InputSource):
|
||
input_source = source
|
||
else:
|
||
if isinstance(source, str):
|
||
location = source
|
||
elif isinstance(source, pathlib.PurePath):
|
||
location = str(source)
|
||
elif isinstance(source, bytes):
|
||
data = source
|
||
elif hasattr(source, "read") and not isinstance(source, Namespace):
|
||
f = source
|
||
input_source = InputSource()
|
||
if hasattr(source, "encoding"):
|
||
input_source.setCharacterStream(source)
|
||
input_source.setEncoding(source.encoding)
|
||
try:
|
||
b = source.buffer # type: ignore[union-attr]
|
||
input_source.setByteStream(b)
|
||
except (AttributeError, LookupError):
|
||
input_source.setByteStream(source)
|
||
else:
|
||
input_source.setByteStream(f)
|
||
if f is sys.stdin:
|
||
input_source.setSystemId("file:///dev/stdin")
|
||
elif hasattr(f, "name"):
|
||
input_source.setSystemId(f.name)
|
||
else:
|
||
raise Exception(
|
||
"Unexpected type '%s' for source '%s'" % (type(source), source)
|
||
)
|
||
|
||
absolute_location = None # Further to fix for issue 130
|
||
|
||
auto_close = False # make sure we close all file handles we open
|
||
|
||
if location is not None:
|
||
if TYPE_CHECKING:
|
||
assert file is None
|
||
assert data is None
|
||
assert source is None
|
||
(
|
||
absolute_location,
|
||
auto_close,
|
||
file,
|
||
input_source,
|
||
) = _create_input_source_from_location(
|
||
file=file,
|
||
format=format,
|
||
input_source=input_source,
|
||
location=location,
|
||
)
|
||
|
||
if file is not None:
|
||
if TYPE_CHECKING:
|
||
assert location is None
|
||
assert data is None
|
||
assert source is None
|
||
input_source = FileInputSource(file)
|
||
|
||
if data is not None:
|
||
if TYPE_CHECKING:
|
||
assert location is None
|
||
assert file is None
|
||
assert source is None
|
||
if isinstance(data, dict):
|
||
input_source = PythonInputSource(data)
|
||
auto_close = True
|
||
elif isinstance(data, (str, bytes, bytearray)):
|
||
input_source = StringInputSource(data)
|
||
auto_close = True
|
||
else:
|
||
raise RuntimeError(f"parse data can only str, or bytes. not: {type(data)}")
|
||
|
||
if input_source is None:
|
||
raise Exception("could not create InputSource")
|
||
else:
|
||
input_source.auto_close |= auto_close
|
||
if publicID is not None: # Further to fix for issue 130
|
||
input_source.setPublicId(publicID)
|
||
# Further to fix for issue 130
|
||
elif input_source.getPublicId() is None:
|
||
input_source.setPublicId(absolute_location or "")
|
||
return input_source
|
||
|
||
|
||
def _create_input_source_from_location(
|
||
file: Optional[Union[BinaryIO, TextIO]],
|
||
format: Optional[str],
|
||
input_source: Optional[InputSource],
|
||
location: str,
|
||
) -> Tuple[URIRef, bool, Optional[Union[BinaryIO, TextIO]], Optional[InputSource]]:
|
||
# Fix for Windows problem https://github.com/RDFLib/rdflib/issues/145 and
|
||
# https://github.com/RDFLib/rdflib/issues/1430
|
||
# NOTE: using pathlib.Path.exists on a URL fails on windows as it is not a
|
||
# valid path. However os.path.exists() returns false for a URL on windows
|
||
# which is why it is being used instead.
|
||
if os.path.exists(location):
|
||
location = pathlib.Path(location).absolute().as_uri()
|
||
|
||
base = pathlib.Path.cwd().as_uri()
|
||
|
||
absolute_location = URIRef(rdflib.util._iri2uri(location), base=base)
|
||
|
||
if absolute_location.startswith("file:///"):
|
||
filename = url2pathname(absolute_location.replace("file:///", "/"))
|
||
file = open(filename, "rb")
|
||
else:
|
||
input_source = URLInputSource(absolute_location, format)
|
||
|
||
auto_close = True
|
||
# publicID = publicID or absolute_location # Further to fix
|
||
# for issue 130
|
||
|
||
return absolute_location, auto_close, file, input_source
|