Source code for phylo2vec.utils.vector

"""Phylo2Vec vector manipulation functions."""

import random
import warnings

from typing import List, Tuple

import numpy as np

from ete4 import Tree

from phylo2vec import _phylo2vec_core as core
from phylo2vec.base.newick import from_newick, to_newick


[docs] def check_vector(v: np.ndarray) -> None: """Input validation of a Phylo2Vec vector The input is checked to satisfy the Phylo2Vec constraints Parameters ---------- v : numpy.ndarray Phylo2Vec vector """ core.check_v(v.tolist())
[docs] def sample_vector(n_leaves: int, ordered: bool = False) -> np.ndarray: """Sample a random tree via Phylo2Vec, in vector form. Parameters ---------- n_leaves : int Number of leaves (>= 2) ordered : bool, optional If True, sample an ordered tree, by default False True: v_i in {0, 1, ..., i} for i in (0, n_leaves-1) False: v_i in {0, 1, ..., 2*i} for i in (0, n_leaves-1) Returns ------- numpy.ndarray Phylo2Vec vector """ v_list = core.sample_vector(n_leaves, ordered) return np.asarray(v_list)
[docs] def remove_leaf(v, leaf) -> Tuple[np.ndarray, int]: """Remove a leaf from a Phylo2Vec v Parameters ---------- v : numpy.ndarray Phylo2Vec vector leaf : int A leaf node to remove Returns ------- v_sub : numpy.ndarray Phylo2Vec vector without `leaf` sister : int Sister node of leaf """ v_sub, sister = core.remove_leaf(v, leaf) return np.asarray(v_sub), sister
[docs] def add_leaf(v, leaf, pos) -> np.ndarray: """Add a leaf to a Phylo2Vec vector v Parameters ---------- v : numpy.ndarray Phylo2Vec vector leaf : int >= 0 A leaf node to add pos : int >= 0 A branch from where the leaf will be added Returns ------- v_add : numpy.ndarray Phylo2Vec vector including the new leaf """ v_add = core.add_leaf(v, leaf, pos) return np.asarray(v_add)
[docs] def queue_shuffle(v, shuffle_cherries=False) -> Tuple[np.ndarray, List[int]]: """ Produce an ordered version (i.e., birth-death process version) of a Phylo2Vec vector using the Queue Shuffle algorithm. Queue Shuffle ensures that the output tree is ordered, while also ensuring a smooth path through the space of orderings For more details, see https://doi.org/10.1093/gbe/evad213 Illustration of the algorithm: ////-3 ////6| ////7| \\\\-2 | | -8| \\\\-1 | | ////-4 \\\\5| \\\\-0 The ancestry array of this tree is: [8, 7, 5] [7, 6, 1] [6, 3, 2] [5, 4, 0] Unrolled, it becomes: 8 7 5 6 1 3 2 4 0 We encode the nodes as it: Start by encoding the first two non-root nodes as 0, 1 For the next pairs: * The left member takes the label was the previous parent node * The right member increments the previous right member by 1 Ex: 8 7 5 6 1 3 2 4 0 0 1 0 2 then 8 7 5 6 1 3 2 4 0 0 1 0 2 1 3 then 8 7 5 6 1 3 2 4 0 0 1 0 2 1 3 0 4 The code for the leaf nodes (0, 1, 2, 3, 4) is their new label Note that the full algorithm also features a queue of internal nodes which could switch the processing order of rows in the ancestry array. Parameters ---------- v : numpy.ndarray Phylo2Vec vector shuffle_cherries : bool, optional If True, shuffle at random the order of the cherries in the ancestry matrix (i.e., the first two columns of the ancestry matrix). Returns ------- v_new : numpy.ndarray Reordered Phylo2Vec vector vec_mapping : List[int] Mapping of the original vector to the new vector index: leaf value: new leaf index in the reordered vector """ v_new, vec_mapping = core.queue_shuffle(v, shuffle_cherries) return np.asarray(v_new), vec_mapping
def reorder_v(reorder_method, v_old, label_mapping_old, shuffle_cols=False): """Shuffle v by reordering leaf labels Current pipeline: get ancestry matrix --> reorder --> re-build vector Note: reordering functions are not up to date. They will be integrated in the Rust core in the future Parameters ---------- reorder_fun : function Function used to reorder the ancestry matrix v_old : numpy.ndarray or list Current Phylo2vec vector label_mapping_old : dict[int, str] Current mapping of node label (integer) to taxa Note: Will be deprecated in a future release; see `queue_shuffle` instead. shuffle_cols : bool, optional If True, shuffle at random the order of the cherries in the ancestry matrix (i.e., the first two columns of the ancestry matrix). Returns ------- v_new : numpy.ndarray or list New Phylo2vec vector label_mapping_new : dict[int, str] New integer-taxon dictionary """ warnings.warn( ( "`reorder_v` is deprecated and will be removed " "in a future version. Use `queue_shuffle` instead." ), FutureWarning, ) # Reorder M if reorder_method == "birth_death": v_new, vec_mapping = queue_shuffle(v_old, shuffle_cherries=shuffle_cols) # For compatibility with the old label mapping (for _hc) label_mapping_new = { i: label_mapping_old[idx] for i, idx in enumerate(vec_mapping) } elif reorder_method == "bfs": raise ValueError( ( "`bfs` method is no longer supported as erroneous. " "Use `birth_death` instead." ) ) else: raise ValueError(f"Unknown value for `reorder_method`: {reorder_method}") return v_new, label_mapping_new
[docs] def get_common_ancestor(vector_or_matrix, node1, node2): """Get the most recent common ancestor between two nodes in a Phylo2Vec tree. `node1` and `node2` can be leaf nodes (0 to n_leaves) or internal nodes (n_leaves to 2*(n_leaves-1)). Similar to `get_common_ancestor` in ETE, but for Phylo2Vec vectors/matrices. Note: The MRCA is purely topological, so for matrices, only the vector component (column 0) is used. Parameters ---------- vector_or_matrix : numpy.ndarray Phylo2Vec vector (ndim == 1) or matrix (ndim == 2) node1 : int A node in the tree node2 : int A node in the tree Returns ------- mrca : int Most recent common ancestor node between node1 and node2 """ if vector_or_matrix.ndim == 2: v = vector_or_matrix[:, 0].astype(int) elif vector_or_matrix.ndim == 1: v = vector_or_matrix else: raise ValueError( "vector_or_matrix should either be a vector (ndim == 1) or matrix (ndim == 2)" ) if not (0 <= node1 <= 2 * len(v) and 0 <= node2 <= 2 * len(v)): raise ValueError("Nodes must be in the range [0, 2 * n_leaves]") return core.get_common_ancestor(v.tolist(), node1, node2)
[docs] def get_node_depth(vector_or_matrix, node: int) -> float: """Get the depth of a node in a Phylo2Vec tree. The depth of a node is the length of the path from the root to that node (i.e., distance from root). This follows the BEAST/ETE convention. The root has depth 0, and depths increase as you move toward the leaves. For vectors, topological depth is returned (all branch lengths = 1). For matrices, actual branch lengths are used. Parameters ---------- vector_or_matrix : numpy.ndarray Phylo2Vec vector (ndim == 1) or matrix (ndim == 2) node : int A node in the tree (0 to 2*n_leaves - 2) Returns ------- float Depth of the node (distance from root) Examples -------- >>> import numpy as np >>> from phylo2vec.utils.vector import get_node_depth >>> # Tree: (0,(1,(2,3)4)5)6 >>> v = np.array([0, 1, 2]) >>> get_node_depth(v, 6) # Root has depth 0 0.0 >>> get_node_depth(v, 0) # Leaf 0 is 1 edge from root 1.0 """ n_leaves = vector_or_matrix.shape[0] + 1 max_node = 2 * n_leaves - 1 if not 0 <= node < max_node: raise ValueError(f"node must be in the range [0, {max_node})") if vector_or_matrix.ndim == 2: return core.get_node_depth_with_bls(vector_or_matrix, node) if vector_or_matrix.ndim == 1: return core.get_node_depth(vector_or_matrix.tolist(), node) raise ValueError( "vector_or_matrix should either be a vector (ndim == 1) or matrix (ndim == 2)" )
[docs] def get_node_depths(vector_or_matrix) -> np.ndarray: """Get the depths of all nodes in a Phylo2Vec tree. The depth of a node is the length of the path from the root to that node (i.e., distance from root). This follows the BEAST/ETE convention. The root has depth 0, and depths increase as you move toward the leaves. For vectors, topological depth is returned (all branch lengths = 1). For matrices, actual branch lengths are used. Parameters ---------- vector_or_matrix : numpy.ndarray Phylo2Vec vector (ndim == 1) or matrix (ndim == 2) Returns ------- numpy.ndarray Depths of all nodes (length = 2 * n_leaves - 1). Index i contains the depth of node i. Examples -------- >>> import numpy as np >>> from phylo2vec.utils.vector import get_node_depths >>> # Tree: (0,(1,(2,3)4)5)6 >>> v = np.array([0, 1, 2]) >>> depths = get_node_depths(v) >>> depths[6] # Root has depth 0 0.0 >>> depths[0] # Leaf 0 is 1 edge from root 1.0 """ if vector_or_matrix.ndim == 2: return np.asarray(core.get_node_depths_with_bls(vector_or_matrix)) if vector_or_matrix.ndim == 1: return np.asarray(core.get_node_depths(vector_or_matrix.tolist())) raise ValueError( "vector_or_matrix should either be a vector (ndim == 1) or matrix (ndim == 2)" )
[docs] def reroot(vector_or_matrix, node) -> np.ndarray: """Reroot a tree in phylo2vec format at a given node Parameters ---------- vector_or_matrix : numpy.ndarray Phylo2Vec vector (ndim == 1)/matrix (ndim == 2) node : int A node to reroot the tree at Must be a valid node in the tree, i.e., in the range [0, 2 * n_leaves - 1] Cannot be the current root node, i.e., 2 * n_leaves Returns ------- numpy.ndarray rerooted vector """ if node < 0 or node > 2 * len(vector_or_matrix): raise ValueError( f"Node {node} is out of bounds for tree with " f"{len(vector_or_matrix)} leaves. " f"Valid range is [0, {2 * len(vector_or_matrix)}]" ) if node == 2 * len(vector_or_matrix): raise ValueError("Cannot reroot at the current root node") # Reroot via ete if vector_or_matrix.ndim == 1: # Using in_parser = 8 includes topology + all labels in_parser = 8 # Using out_parser = 8 omits the root node label # So we use out_parser = 9 (excludes all parent labels) out_parser = 9 elif vector_or_matrix.ndim == 2: # Using in_parser = 1 includes topology + branch lengths + all labels in_parser = 1 # Using out_parser = 1 omits the root node label # So we use out_parser = 5 (excludes all parent labels) out_parser = 5 else: raise ValueError( "Input must be a Phylo2Vec vector (ndim == 1) " "or matrix (ndim == 2)" ) # Convert to Newick format newick = to_newick(vector_or_matrix) # Reroot via ete ete_tree = Tree(newick, parser=in_parser) ete_tree.set_outgroup(f"{node}") # Write rerooted Newick newick_rerooted = ete_tree.write(parser=out_parser) # Convert back to phylo2vec format vector_or_matrix_rerooted = from_newick(newick_rerooted) return vector_or_matrix_rerooted
[docs] def reroot_at_random(vector_or_matrix) -> np.ndarray: """Reroot a tree in phylo2vec format at a random node Parameters ---------- vector_or_matrix : numpy.ndarray Phylo2Vec vector (ndim == 1)/matrix (ndim == 2) Returns ------- numpy.ndarray rerooted vector """ return reroot(vector_or_matrix, random.randint(0, 2 * len(vector_or_matrix) - 1))