Unverified Commit 76f3a7b7 authored by Adrian Seyboldt's avatar Adrian Seyboldt Committed by GitHub

Allow specification of dims instead of shape (#3551)

* Allow specification of dims instead of shape

* Add pm.TidyData

* Create coords for pm.Data(ndarray)

* empty commit to trigger CI

* Apply suggestions from code review
Co-authored-by: default avatarAlexandre ANDORRA <andorra.alexandre@gmail.com>

* apply black formatting

* address review comments & formatting

* Add demonstration of named coordinates/dims

* don't require dim names to be identifiers

* sort imports

* raise ShapeError instead of ValueError

* formatting

* robustify Dtype and ShapeError

* Removed TidyData and refined dims and coords implementation

* Changed name of kwarg export_dims and improved docstrings

* Add link to ArviZ in docstrings

* Removed TidyData from __all__

* Polished Data container NB

* Fixed line break in data.py

* Fix inference of coords for dataframes

* Refined Data container NB

* Updated getting started NB with new dims and coords features

* Reran getting started NB

* Blackified NBs

* rerun with ArviZ branch

* use np.shape to be compatible with tuples/lists

* add tests for named coordinate handling

* Extended tests for data container
Co-authored-by: default avatarMichael Osthege <m.osthege@fz-juelich.de>
Co-authored-by: default avatarMichael Osthege <michael.osthege@outlook.com>
Co-authored-by: default avatarAlexandre ANDORRA <andorra.alexandre@gmail.com>
parent 8a8beab6
......@@ -17,6 +17,7 @@
- `pm.LKJCholeskyCov` now automatically computes and returns the unpacked Cholesky decomposition, the correlations and the standard deviations of the covariance matrix (see [#3881](https://github.com/pymc-devs/pymc3/pull/3881)).
- `pm.Data` container can now be used for index variables, i.e. with integer data and not only floats (issue [#3813](https://github.com/pymc-devs/pymc3/issues/3813), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).
- `pm.Data` container can now be used as input for other random variables (issue [#3842](https://github.com/pymc-devs/pymc3/issues/3842), fixed by [#3925](https://github.com/pymc-devs/pymc3/pull/3925)).
- Allow users to specify coordinates and dimension names instead of numerical shapes when specifying a model. This makes interoperability with ArviZ easier. ([see #3551](https://github.com/pymc-devs/pymc3/pull/3551))
- Plots and Stats API sections now link to ArviZ documentation [#3927](https://github.com/pymc-devs/pymc3/pull/3927)
- Add `SamplerReport` with properties `n_draws`, `t_sampling` and `n_tune` to SMC. `n_tune` is always 0 [#3931](https://github.com/pymc-devs/pymc3/issues/3931).
- SMC-ABC: add option to define summary statistics, allow sampling from more complex models, remove redundant distances [#3940](https://github.com/pymc-devs/pymc3/issues/3940)
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -12,23 +12,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, List, Any
import collections
from copy import copy
import io
import os
import pkgutil
import collections
from typing import Dict, List, Any
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
import theano
__all__ = [
'get_data',
'GeneratorAdapter',
'Minibatch',
'align_minibatches',
'Data',
"get_data",
"GeneratorAdapter",
"Minibatch",
"align_minibatches",
"Data",
]
......@@ -44,8 +46,8 @@ def get_data(filename):
-------
BytesIO of the data
"""
data_pkg = 'pymc3.examples'
return io.BytesIO(pkgutil.get_data(data_pkg, os.path.join('data', filename)))
data_pkg = "pymc3.examples"
return io.BytesIO(pkgutil.get_data(data_pkg, os.path.join("data", filename)))
class GenTensorVariable(tt.TensorVariable):
......@@ -78,14 +80,14 @@ class GeneratorAdapter:
def __init__(self, generator):
if not pm.vartypes.isgenerator(generator):
raise TypeError('Object should be generator like')
raise TypeError("Object should be generator like")
self.test_value = pm.smartfloatX(copy(next(generator)))
# make pickling potentially possible
self._yielded_test_value = False
self.gen = generator
self.tensortype = tt.TensorType(
self.test_value.dtype,
((False, ) * self.test_value.ndim))
self.test_value.dtype, ((False,) * self.test_value.ndim)
)
# python3 generator
def __next__(self):
......@@ -283,11 +285,20 @@ class Minibatch(tt.TensorVariable):
>>> assert x.eval().shape == (2, 20, 20, 40, 10)
"""
RNG = collections.defaultdict(list) # type: Dict[str, List[Any]]
@theano.configparser.change_flags(compute_test_value='raise')
def __init__(self, data, batch_size=128, dtype=None, broadcastable=None, name='Minibatch',
random_seed=42, update_shared_f=None, in_memory_size=None):
RNG = collections.defaultdict(list) # type: Dict[str, List[Any]]
@theano.configparser.change_flags(compute_test_value="raise")
def __init__(
self,
data,
batch_size=128,
dtype=None,
broadcastable=None,
name="Minibatch",
random_seed=42,
update_shared_f=None,
in_memory_size=None,
):
if dtype is None:
data = pm.smartfloatX(np.asarray(data))
else:
......@@ -295,16 +306,16 @@ class Minibatch(tt.TensorVariable):
in_memory_slc = self.make_static_slices(in_memory_size)
self.shared = theano.shared(data[in_memory_slc])
self.update_shared_f = update_shared_f
self.random_slc = self.make_random_slices(self.shared.shape, batch_size, random_seed)
self.random_slc = self.make_random_slices(
self.shared.shape, batch_size, random_seed
)
minibatch = self.shared[self.random_slc]
if broadcastable is None:
broadcastable = (False, ) * minibatch.ndim
broadcastable = (False,) * minibatch.ndim
minibatch = tt.patternbroadcast(minibatch, broadcastable)
self.minibatch = minibatch
super().__init__(self.minibatch.type, None, None, name=name)
theano.Apply(
theano.compile.view_op,
inputs=[self.minibatch], outputs=[self])
theano.Apply(theano.compile.view_op, inputs=[self.minibatch], outputs=[self])
self.tag.test_value = copy(self.minibatch.tag.test_value)
def rslice(self, total, size, seed):
......@@ -313,11 +324,11 @@ class Minibatch(tt.TensorVariable):
elif isinstance(size, int):
rng = pm.tt_rng(seed)
Minibatch.RNG[id(self)].append(rng)
return (rng
.uniform(size=(size, ), low=0.0, high=pm.floatX(total) - 1e-16)
.astype('int64'))
return rng.uniform(
size=(size,), low=0.0, high=pm.floatX(total) - 1e-16
).astype("int64")
else:
raise TypeError('Unrecognized size type, %r' % size)
raise TypeError("Unrecognized size type, %r" % size)
def __del__(self):
del Minibatch.RNG[id(self)]
......@@ -340,10 +351,10 @@ class Minibatch(tt.TensorVariable):
elif isinstance(i, slice):
slc.append(i)
else:
raise TypeError('Unrecognized size type, %r' % user_size)
raise TypeError("Unrecognized size type, %r" % user_size)
return slc
else:
raise TypeError('Unrecognized size type, %r' % user_size)
raise TypeError("Unrecognized size type, %r" % user_size)
def make_random_slices(self, in_memory_shape, batch_size, default_random_seed):
if batch_size is None:
......@@ -351,6 +362,7 @@ class Minibatch(tt.TensorVariable):
elif isinstance(batch_size, int):
slc = [self.rslice(in_memory_shape[0], batch_size, default_random_seed)]
elif isinstance(batch_size, (list, tuple)):
def check(t):
if t is Ellipsis or t is None:
return True
......@@ -364,12 +376,14 @@ class Minibatch(tt.TensorVariable):
return True
else:
return False
# end check definition
if not all(check(t) for t in batch_size):
raise TypeError('Unrecognized `batch_size` type, expected '
'int or List[int|tuple(size, random_seed)] where '
'size and random seed are both ints, got %r' %
batch_size)
raise TypeError(
"Unrecognized `batch_size` type, expected "
"int or List[int|tuple(size, random_seed)] where "
"size and random seed are both ints, got %r" % batch_size
)
batch_size = [
(i, default_random_seed) if isinstance(i, int) else i
for i in batch_size
......@@ -378,12 +392,14 @@ class Minibatch(tt.TensorVariable):
if Ellipsis in batch_size:
sep = batch_size.index(Ellipsis)
begin = batch_size[:sep]
end = batch_size[sep + 1:]
end = batch_size[sep + 1 :]
if Ellipsis in end:
raise ValueError('Double Ellipsis in `batch_size` is restricted, got %r' %
batch_size)
raise ValueError(
"Double Ellipsis in `batch_size` is restricted, got %r"
% batch_size
)
if len(end) > 0:
shp_mid = shape[sep:-len(end)]
shp_mid = shape[sep : -len(end)]
mid = [tt.arange(s) for s in shp_mid]
else:
mid = []
......@@ -392,23 +408,30 @@ class Minibatch(tt.TensorVariable):
end = []
mid = []
if (len(begin) + len(end)) > len(in_memory_shape.eval()):
raise ValueError('Length of `batch_size` is too big, '
'number of ints is bigger that ndim, got %r'
% batch_size)
raise ValueError(
"Length of `batch_size` is too big, "
"number of ints is bigger that ndim, got %r" % batch_size
)
if len(end) > 0:
shp_end = shape[-len(end):]
shp_end = shape[-len(end) :]
else:
shp_end = np.asarray([])
shp_begin = shape[:len(begin)]
slc_begin = [self.rslice(shp_begin[i], t[0], t[1])
if t is not None else tt.arange(shp_begin[i])
for i, t in enumerate(begin)]
slc_end = [self.rslice(shp_end[i], t[0], t[1])
if t is not None else tt.arange(shp_end[i])
for i, t in enumerate(end)]
shp_begin = shape[: len(begin)]
slc_begin = [
self.rslice(shp_begin[i], t[0], t[1])
if t is not None
else tt.arange(shp_begin[i])
for i, t in enumerate(begin)
]
slc_end = [
self.rslice(shp_end[i], t[0], t[1])
if t is not None
else tt.arange(shp_end[i])
for i, t in enumerate(end)
]
slc = slc_begin + mid + slc_end
else:
raise TypeError('Unrecognized size type, %r' % batch_size)
raise TypeError("Unrecognized size type, %r" % batch_size)
return pm.theanof.ix_(*slc)
def update_shared(self):
......@@ -434,7 +457,7 @@ def align_minibatches(batches=None):
else:
for b in batches:
if not isinstance(b, Minibatch):
raise TypeError('{b} is not a Minibatch')
raise TypeError("{b} is not a Minibatch")
for rng in Minibatch.RNG[id(b)]:
rng.seed()
......@@ -447,8 +470,17 @@ class Data:
----------
name: str
The name for this variable
value
value: {List, np.ndarray, pd.Series, pd.Dataframe}
A value to associate with this variable
dims: {str, tuple of str}, optional, default=None
Dimension names of the random variables (as opposed to the shapes of these
random variables). Use this when `value` is a Pandas Series or DataFrame. The
`dims` will then be the name of the Series / DataFrame's columns. See ArviZ
documentation for more information about dimensions and coordinates:
https://arviz-devs.github.io/arviz/notebooks/Introduction.html
export_index_as_coords: bool, optional, default=False
If True, the `Data` container will try to infer what the coordinates should be
if there is an index in `value`.
Examples
--------
......@@ -479,7 +511,7 @@ class Data:
https://docs.pymc.io/notebooks/data_container.html
"""
def __new__(self, name, value):
def __new__(self, name, value, *, dims=None, export_index_as_coords=False):
if isinstance(value, list):
value = np.array(value)
......@@ -497,10 +529,68 @@ class Data:
# transforms it to something digestible for pymc3
shared_object = theano.shared(pm.model.pandas_to_array(value), name)
if isinstance(dims, str):
dims = (dims,)
if not (dims is None or len(dims) == shared_object.ndim):
raise pm.exceptions.ShapeError(
"Length of `dims` must match the dimensions of the dataset.",
actual=len(dims), expected=shared_object.ndim
)
coords = self.set_coords(model, value, dims)
if export_index_as_coords:
model.add_coords(coords)
# To draw the node for this variable in the graphviz Digraph we need
# its shape.
shared_object.dshape = tuple(shared_object.shape.eval())
if dims is not None:
shape_dims = model.shape_from_dims(dims)
if shared_object.dshape != shape_dims:
raise pm.exceptions.ShapeError(
"Data shape does not match with specified `dims`.",
actual=shared_object.dshape, expected=shape_dims
)
model.add_random_variable(shared_object)
model.add_random_variable(shared_object, dims=dims)
return shared_object
@staticmethod
def set_coords(model, value, dims=None):
coords = {}
# If value is a df or a series, we interpret the index as coords:
if isinstance(value, (pd.Series, pd.DataFrame)):
dim_name = None
if dims is not None:
dim_name = dims[0]
if dim_name is None and value.index.name is not None:
dim_name = value.index.name
if dim_name is not None:
coords[dim_name] = value.index
# If value is a df, we also interpret the columns as coords:
if isinstance(value, pd.DataFrame):
dim_name = None
if dims is not None:
dim_name = dims[1]
if dim_name is None and value.columns.name is not None:
dim_name = value.columns.name
if dim_name is not None:
coords[dim_name] = value.columns
if isinstance(value, np.ndarray) and dims is not None:
if len(dims) != value.ndim:
raise pm.exceptions.ShapeError(
"Invalid data shape. The rank of the dataset must match the "
"length of `dims`.",
actual=value.shape, expected=value.ndim
)
for size, dim in zip(value.shape, dims):
coord = model.coords.get(dim, None)
if coord is None:
coords[dim] = pd.RangeIndex(size, name=dim)
return coords
......@@ -56,17 +56,32 @@ class Distribution:
"a 'with model:' block, or use the '.dist' syntax "
"for a standalone distribution.")
if isinstance(name, string_types):
data = kwargs.pop('observed', None)
cls.data = data
if isinstance(data, ObservedRV) or isinstance(data, FreeRV):
raise TypeError("observed needs to be data but got: {}".format(type(data)))
total_size = kwargs.pop('total_size', None)
dist = cls.dist(*args, **kwargs)
return model.Var(name, dist, data, total_size)
else:
if not isinstance(name, string_types):
raise TypeError("Name needs to be a string but got: {}".format(name))
data = kwargs.pop('observed', None)
cls.data = data
if isinstance(data, ObservedRV) or isinstance(data, FreeRV):
raise TypeError("observed needs to be data but got: {}".format(type(data)))
total_size = kwargs.pop('total_size', None)
dims = kwargs.pop('dims', None)
has_shape = 'shape' in kwargs
shape = kwargs.pop('shape', None)
if dims is not None:
if shape is not None:
raise ValueError("Specify only one of 'dims' or 'shape'")
if isinstance(dims, string_types):
dims = (dims,)
shape = model.shape_from_dims(dims)
# Some distributions do not accept shape=None
if has_shape or shape is not None:
dist = cls.dist(*args, **kwargs, shape=shape)
else:
dist = cls.dist(*args, **kwargs)
return model.Var(name, dist, data, total_size, dims=dims)
def __getnewargs__(self):
return _Unpickling,
......@@ -77,7 +92,7 @@ class Distribution:
return dist
def __init__(self, shape, dtype, testval=None, defaults=(),
transform=None, broadcastable=None):
transform=None, broadcastable=None, dims=None):
self.shape = np.atleast_1d(shape)
if False in (np.floor(self.shape) == self.shape):
raise TypeError("Expected int elements in shape")
......@@ -467,8 +482,10 @@ class DensityDist(Distribution):
)
return samples
else:
raise ValueError("Distribution was not passed any random method "
"Define a custom random method and pass it as kwarg random")
raise ValueError(
"Distribution was not passed any random method. "
"Define a custom random method and pass it as kwarg random"
)
class _DrawValuesContext(metaclass=ContextMeta, context_class='_DrawValuesContext'):
......
......@@ -44,8 +44,12 @@ class ImputationWarning(UserWarning):
class ShapeError(Exception):
"""Error that the shape of a variable is incorrect."""
def __init__(self, message, actual=None, expected=None):
if expected and actual:
if actual is not None and expected is not None:
super().__init__('{} (actual {} != expected {})'.format(message, actual, expected))
elif actual is not None and expected is None:
super().__init__('{} (actual {})'.format(message, actual))
elif actual is None and expected is not None:
super().__init__('{} (expected {})'.format(message, expected))
else:
super().__init__(message)
......@@ -53,7 +57,11 @@ class ShapeError(Exception):
class DtypeError(TypeError):
"""Error that the dtype of a variable is incorrect."""
def __init__(self, message, actual=None, expected=None):
if expected and actual:
if actual is not None and expected is not None:
super().__init__('{} (actual {} != expected {})'.format(message, actual, expected))
elif actual is not None and expected is None:
super().__init__('{} (actual {})'.format(message, actual))
elif actual is None and expected is not None:
super().__init__('{} (expected {})'.format(message, expected))
else:
super().__init__(message)
......@@ -39,11 +39,19 @@ from .util import get_transformed_name
from .exceptions import ImputationWarning
__all__ = [
'Model', 'Factor', 'compilef', 'fn', 'fastfn', 'modelcontext',
'Point', 'Deterministic', 'Potential', 'set_data'
"Model",
"Factor",
"compilef",
"fn",
"fastfn",
"modelcontext",
"Point",
"Deterministic",
"Potential",
"set_data",
]
FlatView = collections.namedtuple('FlatView', 'input, replacements, view')
FlatView = collections.namedtuple("FlatView", "input, replacements, view")
class PyMC3Variable(TensorVariable):
......@@ -70,8 +78,7 @@ class InstanceMethod:
return getattr(self.obj, self.method_name)(*args, **kwargs)
def incorporate_methods(source, destination, methods,
wrapper=None, override=False):
def incorporate_methods(source, destination, methods, wrapper=None, override=False):
"""
Add attributes to a destination object which point to
methods from from a source object.
......@@ -94,9 +101,11 @@ def incorporate_methods(source, destination, methods,
"""
for method in methods:
if hasattr(destination, method) and not override:
raise AttributeError("Cannot add method {!r}".format(method) +
"to destination object as it already exists. "
"To prevent this error set 'override=True'.")
raise AttributeError(
"Cannot add method {!r}".format(method)
+ "to destination object as it already exists. "
"To prevent this error set 'override=True'."
)
if hasattr(source, method):
if wrapper is None:
setattr(destination, method, getattr(source, method))
......@@ -146,14 +155,13 @@ def get_named_nodes_and_relations(graph):
graph, None, ancestors, descendents
)
leaf_dict = {
node.name: node for node, ancestor in ancestors.items()
if len(ancestor) == 0
node.name: node for node, ancestor in ancestors.items() if len(ancestor) == 0
}
return leaf_dict, descendents, ancestors
def _get_named_nodes_and_relations(graph, descendent, descendents, ancestors):
if getattr(graph, 'owner', None) is None: # Leaf node
if getattr(graph, "owner", None) is None: # Leaf node
if graph.name is not None: # Named leaf node
if descendent is not None: # Is None for the first node
try:
......@@ -237,7 +245,8 @@ def build_named_node_tree(graphs):
named_nodes_ancestors[k].update(nna[k])
return leaf_dict, named_nodes_descendents, named_nodes_ancestors
T = TypeVar('T', bound='ContextMeta')
T = TypeVar("T", bound="ContextMeta")
class ContextMeta(type):
......@@ -245,19 +254,20 @@ class ContextMeta(type):
the `with` statement.
"""
def __new__(cls, name, bases, dct, **kargs): # pylint: disable=unused-argument
def __new__(cls, name, bases, dct, **kargs): # pylint: disable=unused-argument
"Add __enter__ and __exit__ methods to the class."
def __enter__(self):
self.__class__.context_class.get_contexts().append(self)
# self._theano_config is set in Model.__new__
if hasattr(self, '_theano_config'):
if hasattr(self, "_theano_config"):
self._old_theano_config = set_theano_conf(self._theano_config)
return self
def __exit__(self, typ, value, traceback): # pylint: disable=unused-argument
def __exit__(self, typ, value, traceback): # pylint: disable=unused-argument
self.__class__.context_class.get_contexts().pop()
# self._theano_config is set in Model.__new__
if hasattr(self, '_old_theano_config'):
if hasattr(self, "_old_theano_config"):
set_theano_conf(self._old_theano_config)
dct[__enter__.__name__] = __enter__
......@@ -271,14 +281,14 @@ class ContextMeta(type):
# FIXME: is there a more elegant way to automatically add methods to the class that
# are instance methods instead of class methods?
def __init__(cls, name, bases, nmspc, context_class: Optional[Type]=None, **kwargs): # pylint: disable=unused-argument
def __init__(
cls, name, bases, nmspc, context_class: Optional[Type] = None, **kwargs
): # pylint: disable=unused-argument
"""Add ``__enter__`` and ``__exit__`` methods to the new class automatically."""
if context_class is not None:
cls._context_class = context_class
super().__init__(name, bases, nmspc)
def get_context(cls, error_if_none=True) -> Optional[T]:
"""Return the most recently pushed context object of type ``cls``
on the stack, or ``None``. If ``error_if_none`` is True (default),
......@@ -286,12 +296,12 @@ class ContextMeta(type):
idx = -1
while True:
try:
candidate = cls.get_contexts()[idx] # type: Optional[T]
candidate = cls.get_contexts()[idx] # type: Optional[T]
except IndexError as e:
# Calling code expects to get a TypeError if the entity
# is unfound, and there's too much to fix.
if error_if_none:
raise TypeError("No %s on context stack"%str(cls))
raise TypeError("No %s on context stack" % str(cls))
return None
return candidate
idx = idx - 1
......@@ -308,14 +318,15 @@ class ContextMeta(type):
# no race-condition here, contexts is a thread-local object
# be sure not to override contexts in a subclass however!
context_class = cls.context_class
assert isinstance(context_class, type), \
"Name of context class, %s was not resolvable to a class"%context_class
if not hasattr(context_class, 'contexts'):
assert isinstance(context_class, type), (
"Name of context class, %s was not resolvable to a class" % context_class
)
if not hasattr(context_class, "contexts"):
context_class.contexts = threading.local()
contexts = context_class.contexts
if not hasattr(contexts, 'stack'):
if not hasattr(contexts, "stack"):
contexts.stack = []
return contexts.stack
......@@ -330,13 +341,16 @@ class ContextMeta(type):
c = getattr(modules[cls.__module__], c)
if isinstance(c, type):
return c
raise ValueError("Cannot resolve context class %s"%c)
raise ValueError("Cannot resolve context class %s" % c)
assert cls is not None
if isinstance(cls._context_class, str):
cls._context_class = resolve_type(cls._context_class)
if not isinstance(cls._context_class, (str, type)):
raise ValueError("Context class for %s, %s, is not of the right type"%\
(cls.__name__, cls._context_class))
raise ValueError(
"Context class for %s, %s, is not of the right type"
% (cls.__name__, cls._context_class)
)
return cls._context_class
# Inherit context class from parent
......@@ -353,7 +367,7 @@ class ContextMeta(type):
return instance
def modelcontext(model: Optional['Model']) -> 'Model':
def modelcontext(model: Optional["Model"]) -> "Model":
"""
Return the given model or, if none was supplied, try to find one in
the context stack.
......@@ -372,6 +386,7 @@ class Factor:
"""Common functionality for objects with a log probability density
associated with them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
......@@ -432,33 +447,35 @@ class Factor:
@property
def logpt(self):
"""Theano scalar of log-probability of the model"""
if getattr(self, 'total_size', None) is not None:
if getattr(self, "total_size", None) is not None:
logp = self.logp_sum_unscaledt * self.scaling
else:
logp = self.logp_sum_unscaledt
if self.name is not None:
logp.name = '__logp_%s' % self.name
logp.name = "__logp_%s" % self.name
return logp
@property
def logp_nojact(self):
"""Theano scalar of log-probability, excluding jacobian terms."""
if getattr(self, 'total_size', None) is not None:
if getattr(self, "total_size", None) is not None: