Source code for pyllars.collection_utils

"""
This module implements helpers for working with collections. In some cases,
the iterable is restricted to a particular type, such as a list or set.
"""

import logging
logger = logging.getLogger(__name__)

from typing import Any, Callable, Container, Dict, Iterable, List, Mapping, Optional, Sequence, Set, Tuple

import collections
import itertools
import numpy as np
import pandas as pd
import tqdm
import typing

import pyllars.validation_utils as validation_utils


[docs]def apply_no_return(items:Iterable, func:Callable, *args,
        progress_bar:bool=False, total_items:Optional[int]=None, **kwargs) -> None:
    """ Apply `func` to each item in `items`
    
    Unlike :py:func:`map`, this function does not return anything.
    
    Parameters
    ----------
    items : typing.Iterable
        An iterable
        
    func : typing.Callable
        The function to apply to each item

    args
        Positional arguments for `func`.
    
    kwargs
        Keyword arguments to pass to `func`

    progress_bar : bool
        Whether to show a progress bar when waiting for results.

    total_items : int or None
        The number of items in `items`. If not given, `len` is used.
        Presumably, this is used when `items` is a generator and `len` does
        not work.

    Returns
    -------
    None : None
        If a return value is expected, use list comprehension instead.
    """    
    if progress_bar:

        if total_items is not None:
            items = tqdm.tqdm(items, total=total_items)
        else:
            items = tqdm.tqdm(items, total=len(items))

    for i in items:
        func(*(i, *args), **kwargs)
    
    return None



[docs]def flatten_lists(list_of_lists:Iterable) -> List:
    """ Flatten a list of iterables into a single list
    
    This function does not further flatten inner iterables.

    Parameters
    ----------
    list_of_lists : typing.Iterable
        The iterable to flatten

    Returns
    -------
    flattened_list: typing.List
        The flattened list
    """
    return [item for sublist in list_of_lists for item in sublist]

[docs]def is_iterator_exhausted(iterator:Iterable, return_element:bool=False) -> Tuple[bool, Optional[object]]:
    """ Check if the iterator is exhausted

    N.B. THIS CONSUMES THE NEXT ELEMENT OF THE ITERATOR! The `return_element`
    parameter can change this behavior.

    This method is adapted from this SO question: https://stackoverflow.com/questions/661603

    Parameters
    ----------
    iterator : typing.Iterable
        The iterator

    return_element : bool
        Whether to return the next element of the iterator

    Returns
    -------
    is_exhausted : bool
        Whether there was a next element in the iterator

    [optional] next_element : object
        It `return_element` is `True`, then the consumed element is also
        returned.
    """

    # create a flag as the default value for "next"
    flag = object()

    # grab the next thing in the iterator, or our flag if there is nothing
    n = next(iterator, flag)

    # check if we saw the flag or some real value
    is_exhausted = (n == flag)

    # build up the return
    ret = is_exhausted

    if return_element:
        ret = (is_exhausted, n)

    return ret



[docs]def list_insert_list(l:Sequence, to_insert:Sequence, index:int) -> List:
    """ Insert `to_insert` into a shallow copy of `l` at position `index`.
    
    This function is adapted from: http://stackoverflow.com/questions/7376019/
    
    Parameters
    ----------
    l : typing.Sequence
        An iterable 
    
    to_insert : typing.Sequence
        The items to insert
        
    index : int
        The location to begin the insertion
        
    Returns
    -------
    updated_l : typing.List
        A list with `to_insert` inserted into `l` at position `index`
    """

    ret = list(l)
    ret[index:index] = list(to_insert)
    return ret

[docs]def list_remove_list(l:Iterable, to_remove:Container) -> List:
    """ Remove items in `to_remove` from `l`
    
    Note that  "not in" is used to match items in `to_remove`. Additionally,
    the return *is not* lazy.
    
    Parameters
    ----------
    l : typing.Iterable
        An iterable of items
        
    to_remove : typing.Container
        The set of items to remove from `l`
        
    Returns
    -------
    copy_of_l : typing.List
        A shallow copy of `l` without the items in `to_remove`.
    """
    ret = [i for i in l if i not in to_remove]
    return ret

[docs]def list_to_dict(l:Sequence, f:Optional[Callable]=None) -> Dict:
    """ Convert the list to a dictionary in which keys and values are adjacent
    in the list. Optionally, a function `f` can be passed to apply to each value
    before adding it to the dictionary.


    Parameters
    ----------
    l: typing.Sequence
        The list of items

    f: typing.Callable
        A function to apply to each value before inserting it into the list.
        For example, `float` could be passed to convert each value to a float.

    Returns
    -------
    d: typing.Dict
        The dictionary, defined as described above
        
    
    Examples
    --------
    
    .. code-block:: python
    
        l = ["key1", "value1", "key2", "value2"]
        list_to_dict(l, f) == {"key1": f("value1"), "key2": f("value2")}
    """
    if len(l) % 2 != 0:
        msg = ("[collection_utils.list_to_dict]: the list must contain an even number"
            "of elements")
        raise ValueError(msg)

    if f is None:
        f = lambda x: x

    keys = l[::2]
    values = l[1::2]
    d = {k:f(v) for k, v in zip(keys, values)}
    return d


[docs]def remove_nones(l:Iterable, return_np_array:bool=False) -> List:
    """ Remove `None`s from `l`
    
    Compared to other single-function tests, this uses "is" and avoids strange
    behavior with data frames, lists of bools, etc.
    
    This function returns a shallow copy and is not lazy.
    
    N.B. This does not test nested lists. So, for example, a list of lists
    of `None` values would be unchanged by this function.
    
    Parameters
    ----------
    l : typing.Iterable
        The iterable
        
    return_np_array : bool
        If true, the filtered list will be wrapped in an np.array.

    Returns
    -------
    l_no_nones : typing.List
        A list or np.array with the `None`s removed from `l`
    """
    ret = [i for i in l if i is not None]

    if return_np_array:
        ret = np.array(ret)

    return ret

[docs]def replace_none_with_empty_iter(i:Optional[Iterable]) -> Iterable:
    """ Return an empty iterator if `i` is `None`. Otherwise, return `i`.

    The purpose of this function is to make iterating over results from
    functions which return either an iterator or `None` cleaner. This function
    does not verify that `i` is actually an iterator.

    Parameters
    ----------
    i: None or typing.Iterable
        The possibly-empty iterator

    Returns
    -------
    i: typing.Iterable
        An empty list if iterator is None, or the original iterator otherwise
    """
    if i is None:
        return []
    return i



[docs]def wrap_in_list(maybe_sequence:Any) -> Sequence:
    """ If `maybe_sequence` is not a sequence, then wrap it in a list
    
    See :func:`pyllars.validation_utils.is_sequence` for more details about
    what counts as a sequence.

    Parameters
    ----------
    maybe_sequence : typing.Any
        An object which may be a sequence

    Returns
    -------
    list : typing.Sequence
        Either the original object, or `maybe_sequence` wrapped in a list, if
        it was not already a sequence
    """
    ret = maybe_sequence
    
    is_sequence = validation_utils.validate_is_sequence(ret, raise_on_invalid=False)
    
    if not is_sequence:
        ret = [ret]
        
    return ret

[docs]def wrap_string_in_list(maybe_string:Any) -> Sequence:
    """ If `maybe_string` is a string, then wrap it in a list.

    The motivation for this function is that some functions return either a
    single string or multiple strings as a list. The return value of this
    function can be iterated over safely.
    
    This function will fail if `maybe_string` is not a string and it not
    a sequence.

    Parameters
    ----------
    maybe_string : typing.Any
        An object which may be a string

    Returns
    -------
    l : typing.Sequence
        Either the original object, or `maybe_string` wrapped in a list, if
        it was a string}
    """
    
    ret = maybe_string
    if isinstance(maybe_string, str):
        ret = [ret]
        
    validation_utils.validate_is_sequence(ret)
    
    return ret


###
# Set helpers
###


[docs]def wrap_in_set(maybe_set:Optional[Any], wrap_string:bool=True) -> Set:
    """ If `maybe_set` is not a set, then wrap it in a set.
    
    Parameters
    ----------
    maybe_set : typing.Optional[typing.Any]
        An object which may be a set
        
    wrap_string : bool
        Whether to wrap `maybe_set` as a singleton if it is a string.
        Otherwise, the string will be converted into a set of individual
        characters.
        
    Returns
    -------
    s : typing.Set
        Either the original object, or `maybe_set` wrapped in a set, if
        it was not already a set. If `maybe_set` was `None`, then an
        empty set is returned.
    """
    ret = maybe_set
    
    if ret is None:
        ret = set()
        
    # handle strings explicitly
    if isinstance(ret, str):
        if wrap_string:
            ret = set([ret])

    # check if we already have a set-like object
    if not isinstance(ret, collections.abc.Set):

        # if not and it is an iterable
        if isinstance(ret, collections.abc.Iterable):
            # then we can just directly wrap it
            ret = set(ret)
            
        else:
            # otherwise, we must first wrap the object in a list, and
            # then wrap it in the set
            ret = set([ret])
            
    return ret

[docs]def get_set_pairwise_intersections(
        dict_of_sets:Mapping[str,Set],
        return_intersections:bool=True) -> pd.DataFrame:
    """ Find the pairwise intersections among sets in `dict_of_sets`
    
    Parameters
    ----------
    dict_of_sets : typing.Mapping[str,typing.Set]
        A mapping in which the keys are the "names" of the sets and the values
        are the actual sets
        
    return_intersections : bool
        Whether to include the actual set intersections in the return. If `False`,
        then only the intersection size will be included.
        
    Returns
    -------
    df_pairswise_intersections : pandas.DataFrame
        A dataframe with the following columns:
        
        * `set1` : the name of one set in the pair
        * `set2` : the name of the second set in the pair
        * `len(set1)` : the size of set1
        * `len(set2)` : the size of set2
        * `len(intersection)` : the size of the intersection
        * `coverage_small` : the fraction of the smaller of set1 or set2 in the intersection
        * `coverage_large` : the fraction of the larger of set1 or set2 in the intersection
        * `intersection` : the intersection set. Only included if `return_intersections` is `True`.
        
    """
    all_intersection_sizes = []

    it = itertools.combinations(dict_of_sets, 2)

    for i in it:
        s1 = i[0]
        s2 = i[1]
        set1 = dict_of_sets[s1]
        set2 = dict_of_sets[s2]

        intersection = set1 & set2
        
        # determine the coverage of both sets
        coverage_set1 = len(intersection) / len(set1)
        coverage_set2 = len(intersection) / len(set2)
        
        # and set the appropriate "coverage" variables
        if len(set1) > len(set2):
            coverage_small = coverage_set2
            coverage_large = coverage_set1
        else:
            coverage_small = coverage_set1
            coverage_large = coverage_set2
            
        intersection_size = {
            'set1': s1,
            'set2': s2,
            'len(set1)': len(set1),
            'len(set2)': len(set2),
            'len(intersection)': len(intersection),
            'coverage_small': coverage_small,
            'coverage_large': coverage_large
        }

        if return_intersections:
            intersection_size['intersection'] = intersection

        all_intersection_sizes.append(intersection_size)

    df_intersection_sizes = pd.DataFrame(all_intersection_sizes)
    return df_intersection_sizes


[docs]def merge_sets(*set_args:Iterable[Container]) -> Set:
    """ Given any number of sets, merge them into a single set

    N.B. This function only performs a "shallow" merge. It does not handle
    nested containers within the "outer" sets.

    Parameters
    ----------
    set_args: typing.Iterable[typing.Container]
        The sets to merge

    Returns
    -------
    merged_set: typing.Set
        A single set containing unique elements from each of the input sets
    """
    ret = {item for s in set_args for item in s}
    return ret

###
# Dictionary helpers
###
[docs]def reverse_dict(d:Mapping) -> Dict:
    """ Create a new dictionary in which the keys and values of `d` are switched
    
    In the case of duplicate values, it is arbitrary which will be retained.
    
    Parameters
    ----------
    d : typing.Mapping
        The mapping
        
    Returns
    -------
    reversed_d : typing.Dict
        A dictionary in which the values of `d` now map to the keys
    """
    reverse_d = {v:k for k,v in d.items()}    
    return reverse_d


[docs]def sort_dict_keys_by_value(d:Mapping) -> List:
    """ Sort the keys in `d` by their value and return as a list

    This function uses `sorted`, so the values should be able to be sorted
    appropriately by that builtin function.
    
    Parameters
    ----------
    d : typing.Mapping
        The dictionary
        
    Returns
    -------
    sorted_keys : typing.List
        The keys sorted by the associated values
    """
    ret = sorted(d, key=d.get)
    return ret