# Source code for pyllars.string_utils

""" Utilities for working with strings
"""
import logging
logger = logging.getLogger(__name__)

import numpy as np
import pandas as pd
import re
import tqdm

from typing import Iterable, List, Mapping, Optional, Sequence, Union
# Type alias: a mapping from a single character to its numeric feature vector.
encoding_map_type = Mapping[str, np.ndarray]
# Type alias: a value which is either a numpy array or a plain list.
np_array_or_list = Union[np.ndarray,List]

# The lower-case strings which `str2bool` interprets as `True`.
_TRUE_STRING = {'true', 'yes', 't', 'y', '1'}

def str2bool(s:str) -> bool:
    """ Interpret `s` as a boolean flag

    The comparison is case-insensitive: `s` is lower-cased before it is
    looked up in `_TRUE_STRING`.

    Parameters
    ----------
    s : string
        The text to interpret

    Returns
    -------
    bool_s : boolean
        `True` when the lower-cased `s` is a member of `_TRUE_STRING`, and
        `False` for every other string
    """
    normalized = s.lower()
    is_true = normalized in _TRUE_STRING
    return is_true

def try_parse_int(s:str) -> Optional[int]:
    """ Convert `s` to an integer, if possible

    Parameters
    ----------
    s : string
        A string which may represent an integer

    Returns
    -------
    int_s : int
        The result of `int(s)`
    --- OR ---
    None
        If `s` cannot be parsed into an `int`. This covers malformed strings
        (e.g., `"abc"` or `"1.5"`) as well as objects which `int` does not
        accept at all (e.g., `None`).
    """
    try:
        return int(s)
    except (ValueError, TypeError):
        # ValueError: a string which is not an integer literal
        # TypeError: an object `int` cannot convert at all, such as `None`
        return None

def try_parse_float(s:str) -> Optional[float]:
    """ Convert `s` to a float, if possible

    Parameters
    ----------
    s : string
        A string which may represent a float

    Returns
    -------
    float_s : float
        The result of `float(s)`
    --- OR ---
    None
        If `s` cannot be parsed into a `float`. This covers malformed strings
        (e.g., `"abc"`) as well as objects which `float` does not accept at
        all (e.g., `None`).
    """
    try:
        return float(s)
    except (ValueError, TypeError):
        # ValueError: a string which is not a float literal
        # TypeError: an object `float` cannot convert at all, such as `None`
        return None

def bytes2human(n:int, format:str="%(value)i%(symbol)s") -> str:
    """ Convert `n` bytes to a human-readable format

    The largest binary unit (K = 2**10, M = 2**20, ...) which does not exceed
    `n` is selected. Values smaller than 1K are reported in bytes (`B`).

    This code is adapted from: http://goo.gl/zeJZl

    Parameters
    ----------
    n : int
        The number of bytes

    format : string
        A `%`-style format string. It may refer to the keys `value` (the
        scaled quantity) and `symbol` (the one-letter unit).

    Returns
    -------
    human_str : string
        A human-readable version of the number of bytes

    Examples
    --------

    >>> bytes2human(10000)
    '9K'
    >>> bytes2human(100001221)
    '95M'

    """
    symbols = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')

    # Try the units from largest to smallest; the position of a symbol in
    # `symbols` is its power of 1024.
    for exponent in range(len(symbols) - 1, 0, -1):
        unit_size = 1 << (exponent * 10)
        if n >= unit_size:
            # Pass an explicit mapping (rather than `locals()`) so that both
            # return paths expose exactly the same keys to `format`.
            return format % {
                'symbol': symbols[exponent],
                'value': float(n) / unit_size,
            }

    # smaller than 1K: report the raw count in bytes
    return format % {'symbol': symbols[0], 'value': n}

class _InvalidByteString(ValueError, AssertionError):
    """ Raised by `human2bytes` for strings it cannot interpret

    It derives from `AssertionError` so that callers which caught the
    failure of the former `assert` statement keep working, and from
    `ValueError` because that is the conventional type for a malformed value.
    """


def human2bytes(s : str) -> int:
    """ Convert a human-readable byte string to an integer

    A string which is already a plain integer is returned as that integer.
    Otherwise, the last character must be one of the unit symbols
    `B`, `K`, `M`, `G`, `T`, `P`, `E`, `Z`, `Y` (case-insensitive) and
    everything before it must consist of digits only. Units are powers
    of 1024.

    This code is adapted from: http://goo.gl/zeJZl

    Parameters
    ----------
    s : string
        The human-readable byte string

    Returns
    -------
    num_bytes : int
        The number of bytes

    Raises
    ------
    ValueError
        If `s` is neither an integer nor digits followed by a unit symbol.
        (The raised exception is also an `AssertionError` for backward
        compatibility.)

    Examples
    --------
    >>> human2bytes('1M')
    1048576
    >>> human2bytes('1G')
    1073741824
    """
    # first, check if s is already a number
    s_i = try_parse_int(s)
    if s_i is not None:
        return s_i

    symbols = ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
    letter = s[-1:].strip().upper()
    num = s[:-1]

    # Validate explicitly; an `assert` would be stripped under `python -O`.
    # `isdecimal` accepts exactly the digit characters which `int` can parse.
    if not (num.isdecimal() and letter in symbols):
        msg = "Cannot interpret as a number of bytes: {!r}".format(s)
        raise _InvalidByteString(msg)

    # The position of the symbol is its power of 1024. Integer arithmetic
    # keeps large values exact (a float loses precision above 2**53).
    return int(num) * (1 << (symbols.index(letter) * 10))

def encode_all_sequences(
        sequences:Iterable[str],
        encoding_map:encoding_map_type,
        maxlen:Optional[int]=None,
        align:str="start",
        pad_value:str='J',
        same_length:bool=False,
        flatten:bool=False,
        return_as_numpy:bool=True,
        swap_axes:bool=False,
        progress_bar:bool=True) -> np_array_or_list:
    """ Encode every sequence using the per-character features

    Unless `same_length` is set, the sequences are first passed through
    `pad_trim_sequences`. Each (padded) sequence is then encoded with
    `encode_sequence`; see that function for details on the encoding.

    Parameters
    ----------
    sequences : typing.Iterable[str]
        The sequences

    encoding_map : typing.Mapping[str, numpy.ndarray]
        The features for each character

    maxlen : typing.Optional[int]
        Forwarded to `pad_trim_sequences`. Ignored if `same_length` is `True`.

    align : str
        Forwarded to `pad_trim_sequences`. Ignored if `same_length` is `True`.

    pad_value : str
        Forwarded to `pad_trim_sequences`. Ignored if `same_length` is `True`.

    same_length : bool
        If `True`, the padding/trimming step is skipped and `sequences` are
        encoded as given.

    flatten : bool
        Forwarded to `encode_sequence`: whether to (attempt to) convert the
        features of each sequence into a single long vector (`True`) or
        leave them as a (presumably) 2d position-feature array.

    return_as_numpy : bool
        Whether to wrap the encoded sequences in a single numpy array
        (`True`) or return them as a list with one entry per sequence.

    swap_axes : bool
        If the values are returned as a numpy array, swap axes 1 and 2.

        N.B. This flag is only compatible with `return_as_numpy=True` and
        `flatten=False`. It is ignored when `return_as_numpy` is `False`.

    progress_bar : bool
        Whether to show a `tqdm` progress bar while encoding.

    Returns
    -------
    all_encoded_peptides : typing.Union[numpy.ndarray, typing.List]
        The resulting features. See the `flatten` and `return_as_numpy`
        parameters for the expected output.
    """
    if same_length:
        prepared = sequences
    else:
        prepared = pad_trim_sequences(
            sequences,
            maxlen=maxlen,
            align=align,
            pad_value=pad_value
        )

    iterator = tqdm.tqdm(prepared) if progress_bar else prepared

    encoded = []
    for seq in iterator:
        encoded.append(encode_sequence(seq, encoding_map, flatten=flatten))

    if not return_as_numpy:
        return encoded

    encoded = np.array(encoded)

    #TODO: make the meaning of this more clear
    # Elsewhere, this was referred to as setting the "channel" to "first".
    if swap_axes:
        encoded = np.swapaxes(encoded, 1, 2)

    return encoded

def encode_sequence(
        sequence:str,
        encoding_map:Mapping[str, np.ndarray],
        flatten:bool=False) -> Optional[np.ndarray]:
    """ Extract the features of each character in the given sequence

    This function is designed with the idea of mapping from a sequence to
    numeric features (such as chemical properties or BLOSUM features for amino
    acid sequences). It may fail if other features are included in
    `encoding_map`.

    Parameters
    ----------
    sequence : str
        The sequence

    encoding_map : typing.Mapping[str, numpy.ndarray]
        A mapping from each character to a set of features. Presumably, the
        features are numpy-like arrays, though they need not be.

    flatten : bool
        Whether to flatten the encoded sequence into a single, 1d array or
        leave them as-is.

    Returns
    -------
    encoded_sequence : numpy.ndarray
        The result of `numpy.array` applied to the per-character features,
        so the outer dimension indexes the position in the sequence. If
        `flatten` is `True`, the per-position features are concatenated
        with `numpy.hstack` into a single long feature vector. This will
        likely fail if the `encoding_map` values are not numpy-like arrays.
    --- OR ---
    None
        If `sequence` contains a character which is not a key of
        `encoding_map`. A warning is logged in this case.
    """
    try:
        # Index the mapping (rather than calling `.get`) so that an unknown
        # character raises `KeyError` instead of silently becoming `None`.
        features = [encoding_map[a] for a in sequence]
    except KeyError:
        logger.warning("Found invalid character. Sequence: %s", sequence)
        return None

    encoded_sequence = np.array(features)

    if flatten:
        encoded_sequence = np.hstack(encoded_sequence)

    return encoded_sequence

def pad_sequence(
        seq:str,
        max_seq_len:int,
        pad_value:str="J",
        align:str="end") -> str:
    """ Pad `seq` to `max_seq_len` with `pad_value` based on the `align` strategy

    If `seq` is already of length `max_seq_len` *or longer* it will not be
    changed.

    Parameters
    ----------
    seq : str
        The character sequence

    max_seq_len : int
        The length to which `seq` is padded

    pad_value : str
        The value for padding. This should be a single character. If it is
        longer, the number of repetitions is divided by its length (rounding
        down), so the result may be shorter than `max_seq_len`.

    align : str
        The strategy for padding the string. Valid options are `start`, `end`,
        and `center`. `start` keeps `seq` at the start (padding is appended),
        `end` keeps `seq` at the end (padding is prepended), and `center`
        pads both sides, with the left side receiving the extra character
        when the amount of padding is odd.

    Returns
    -------
    padded_seq : str
        The padded string. In case `seq` was already long enough or longer, it
        will not be changed. So `padded_seq` could be longer than `max_seq_len`.

    Raises
    ------
    ValueError
        If `align` is not one of `start`, `end` or `center`
    """
    # Clamp at zero so that a sequence which is already long enough is
    # returned unchanged, as documented.
    n_missing = max(max_seq_len - len(seq), 0)

    if align == "end":
        n_left, n_right = n_missing, 0
    elif align == "start":
        n_left, n_right = 0, n_missing
    elif align == "center":
        n_right, n_extra = divmod(n_missing, 2)
        n_left = n_right + n_extra
    else:
        raise ValueError("align can be of: end, start or center")

    # normalize for the length
    n_left = n_left // len(pad_value)
    n_right = n_right // len(pad_value)

    return pad_value * n_left + seq + pad_value * n_right

def pad_trim_sequences(
        seq_vec:Sequence[str],
        pad_value:str='J',
        maxlen:Optional[int]=None,
        align:str="start") -> List[str]:
    """Pad and/or trim a list of sequences to have common length

    The procedure is as follows:
        1. Pad the sequence with `pad_value`
        2. Trim the sequence

    Parameters
    ----------
    seq_vec : typing.Sequence[str]
        The sequences, which can have various lengths. This must be a `list`,
        a `pandas.Series` or a `numpy.ndarray`. A single string is treated as
        a list containing that one sequence.

    pad_value : str
        Neutral element with which to pad the sequence. This should be a single
        character.

    maxlen: typing.Optional[int]
        Length of padded/trimmed sequences. If `None`, `maxlen` is set to the
        longest sequence length.

    align : str
        To which end to align the sequences when triming/padding. Valid options
        are `start`, `end`, `center`

    Returns
    -------
    padded_sequences : typing.List[str]
        The padded and/or trimmed sequences. An empty `seq_vec` results in an
        empty list.
    """
    if isinstance(seq_vec, str):
        seq_vec = [seq_vec]
    assert isinstance(seq_vec, (list, pd.Series, np.ndarray))
    assert len(pad_value) == 1

    # nothing to pad or trim
    if len(seq_vec) == 0:
        return []

    # Look at the first element by iteration rather than `seq_vec[0]`:
    # indexing a `pd.Series` is label-based and fails if `0` is not a label.
    assert isinstance(next(iter(seq_vec)), str)

    max_seq_len = max(len(seq) for seq in seq_vec)
    maxlen = max_seq_len if maxlen is None else int(maxlen)

    if max_seq_len < maxlen:
        logger.warning(
            "Maximum sequence length (%s) is less than maxlen (%s)",
            max_seq_len,
            maxlen
        )

    # Pad everything to the longer of the two lengths, so that the trimming
    # step always receives sequences with at least `maxlen` characters.
    pad_len = max(max_seq_len, maxlen)

    # pad and trim
    return [
        trim_sequence(
            pad_sequence(seq, pad_len, pad_value=pad_value, align=align),
            maxlen,
            align=align
        ) for seq in seq_vec
    ]

def simple_fill(text:str, width:int=60) -> str:
    """ Break `text` into consecutive lines of `width` characters

    This is a simplified version of `textwrap.fill`: the text is cut at
    fixed offsets, without regard to whitespace or word boundaries.

    The code is adapted from: http://stackoverflow.com/questions/11781261

    Parameters
    ----------
    text : string
        The text to break into lines

    width : int
        The (exact) number of characters on each line

    Returns
    -------
    split_str : string
        A single string in which every line has `width` characters (except
        possibly the last line, which holds the remainder)
    """
    chunks = []
    for start in range(0, len(text), width):
        stop = start + width
        chunks.append(text[start:stop])
    return '\n'.join(chunks)


def split(s:str, delimiters:Iterable[str], maxsplit:int=0) -> List[str]:
    """ Split `s` on any of the given `delimiters`

    The delimiters are treated as literal text (they are escaped before
    being combined into a single regular expression).

    This code is adapted from: http://stackoverflow.com/questions/4998629/

    Parameters
    ----------
    s : string
        The string to split

    delimiters : list of strings
        The strings to use as delimiters

    maxsplit : int
        The maximum number of splits (or 0 for no limit)

    Returns
    -------
    splits : list of strings
        the split strings
    """
    regex_pattern = '|'.join(re.escape(delimiter) for delimiter in delimiters)

    # `maxsplit` is passed by keyword: passing it positionally to `re.split`
    # is deprecated as of Python 3.13.
    return re.split(regex_pattern, s, maxsplit=maxsplit)

def trim_sequence(
        seq:str,
        maxlen:int,
        align:str="end") -> str:
    """ Trim `seq` to at most `maxlen` characters using `align` strategy

    Parameters
    ----------
    seq : str
        The (amino acid) sequence

    maxlen : int
        The maximum length. This must not be negative.

    align : str
        The strategy for trimming the string. Valid options are `start`, `end`,
        and `center`. `start` keeps the first `maxlen` characters, `end`
        keeps the last `maxlen` characters, and `center` removes characters
        from both sides (one more from the left when the number of removed
        characters is odd).

    Returns
    -------
    trimmed_seq : str
        The trimmed string. In case `seq` was already an appropriate length, it
        will not be changed. So `trimmed_seq` could be shorter than `maxlen`.

    Raises
    ------
    ValueError
        If `align` is not one of `start`, `end` or `center`, or if `maxlen`
        is negative
    """
    if align not in ("end", "start", "center"):
        raise ValueError("align can be of: end, start or center")
    if maxlen < 0:
        raise ValueError("maxlen must not be negative")

    seq_len = len(seq)
    n_excess = seq_len - maxlen

    # already short enough: nothing to trim
    if n_excess <= 0:
        return seq

    if align == "end":
        # Slice from an explicit offset: `seq[-maxlen:]` would return the
        # whole sequence when `maxlen` is 0.
        return seq[n_excess:]

    if align == "start":
        return seq[:maxlen]

    # center: drop the extra character from the left if `n_excess` is odd
    n_left = n_excess // 2 + n_excess % 2
    return seq[n_left:n_left + maxlen]