vg
tools for working with variation graphs
|
#include <minimizer_mapper.hpp>
Classes | |
struct | Minimizer |
Public Types | |
enum | RescueAlgorithm { rescue_none, rescue_dozeu, rescue_gssw, rescue_haplotypes } |
Implemented rescue algorithms: no rescue, dozeu, GSSW, dozeu on local haplotypes. More... | |
Public Member Functions | |
MinimizerMapper (const gbwtgraph::GBWTGraph &graph, const std::vector< gbwtgraph::DefaultMinimizerIndex * > &minimizer_indexes, MinimumDistanceIndex &distance_index, const PathPositionHandleGraph *path_graph=nullptr) | |
void | map (Alignment &aln, AlignmentEmitter &alignment_emitter) |
vector< Alignment > | map (Alignment &aln) |
pair< vector< Alignment >, vector< Alignment > > | map_paired (Alignment &aln1, Alignment &aln2, vector< pair< Alignment, Alignment >> &ambiguous_pair_buffer) |
pair< vector< Alignment >, vector< Alignment > > | map_paired (Alignment &aln1, Alignment &aln2) |
bool | fragment_distr_is_finalized () |
void | finalize_fragment_length_distr () |
void | attempt_rescue (const Alignment &aligned_read, Alignment &rescued_alignment, bool rescue_forward) |
int64_t | distance_between (const Alignment &aln1, const Alignment &aln2) |
![]() | |
void | set_alignment_scores (int8_t match, int8_t mismatch, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
Set all the aligner scoring parameters and create the stored aligner instances. More... | |
void | set_alignment_scores (std::istream &matrix_stream, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
void | set_alignment_scores (const int8_t *score_matrix, int8_t gap_open, int8_t gap_extend, int8_t full_length_bonus) |
Public Attributes | |
size_t | hit_cap = 10 |
Use all minimizers with at most hit_cap hits. More... | |
size_t | hard_hit_cap = 300 |
Ignore all minimizers with more than hard_hit_cap hits. More... | |
double | minimizer_score_fraction = 0.6 |
size_t | max_extensions = 48 |
How many clusters should we align? More... | |
size_t | max_alignments = 8 |
How many extended clusters should we align, max? More... | |
size_t | max_local_extensions = numeric_limits<size_t>::max() |
How many extensions should we try as seeds within a mapping location? More... | |
double | cluster_score_threshold = 50 |
double | pad_cluster_score_threshold = 20 |
double | cluster_coverage_threshold = 0.4 |
double | extension_set_score_threshold = 20 |
int | extension_score_threshold = 1 |
size_t | max_multimaps = 1 |
size_t | distance_limit = 200 |
bool | do_dp = true |
string | sample_name |
string | read_group |
bool | track_provenance = false |
bool | track_correctness = false |
size_t | max_rescue_attempts = 0 |
For paired end mapping, how many times should we attempt rescue (per read)? More... | |
RescueAlgorithm | rescue_algorithm = rescue_dozeu |
The algorithm used for rescue. More... | |
![]() | |
bool | adjust_alignments_for_base_quality = false |
Protected Types | |
typedef SnarlSeedClusterer::Seed | Seed |
The information we store for each seed. More... | |
typedef SnarlSeedClusterer::Cluster | Cluster |
The information we store for each cluster. More... | |
using | ImmutablePath = structures::ImmutableList< Mapping > |
Protected Member Functions | |
std::vector< Minimizer > | find_minimizers (const std::string &sequence, Funnel &funnel) const |
std::vector< Seed > | find_seeds (const std::vector< Minimizer > &minimizers, const Alignment &aln, Funnel &funnel) const |
void | score_cluster (Cluster &cluster, size_t i, const std::vector< Minimizer > &minimizers, const std::vector< Seed > &seeds, size_t seq_length, Funnel &funnel) const |
std::vector< int > | score_extensions (const std::vector< std::vector< GaplessExtension >> &extensions, const Alignment &aln, Funnel &funnel) const |
std::vector< int > | score_extensions (const std::vector< std::pair< std::vector< GaplessExtension >, size_t >> &extensions, const Alignment &aln, Funnel &funnel) const |
double | phred_for_at_least_one (size_t p, size_t n) const |
double | compute_mapq_caps (const Alignment &aln, const std::vector< Minimizer > &minimizers, const SmallBitset &present_in_any_extended_cluster) |
double | window_breaking_quality (const vector< Minimizer > &minimizers, vector< size_t > &broken, const string &sequence, const string &quality_bytes) const |
void | find_optimal_tail_alignments (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, Alignment &best, Alignment &second_best) const |
unordered_map< size_t, unordered_map< size_t, vector< Path > > > | find_connecting_paths (const vector< GaplessExtension > &extended_seeds, size_t read_length) const |
vector< TreeSubgraph > | get_tail_forest (const GaplessExtension &extended_seed, size_t read_length, bool left_tails) const |
pair< Path, size_t > | get_best_alignment_against_any_tree (const vector< TreeSubgraph > &trees, const string &sequence, const Position &default_position, bool pin_left) const |
void | dfs_gbwt (const Position &from, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
void | dfs_gbwt (handle_t from_handle, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
void | dfs_gbwt (const gbwt::SearchState &start_state, size_t from_offset, size_t walk_distance, const function< void(const handle_t &)> &enter_handle, const function< void(void)> exit_handle) const |
template<typename Item , typename Score = double> | |
void | process_until_threshold_a (const vector< Item > &items, const function< Score(size_t)> &get_score, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
template<typename Item , typename Score = double> | |
void | process_until_threshold_b (const vector< Item > &items, const vector< Score > &scores, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
template<typename Item , typename Score = double> | |
void | process_until_threshold_c (const vector< Item > &items, const function< Score(size_t)> &get_score, const function< bool(size_t, size_t)> &comparator, double threshold, size_t min_count, size_t max_count, const function< bool(size_t)> &process_item, const function< void(size_t)> &discard_item_by_count, const function< void(size_t)> &discard_item_by_score) const |
![]() | |
AlignerClient (double gc_content_estimate=vg::default_gc_content) | |
const GSSWAligner * | get_aligner (bool have_qualities=true) const |
const QualAdjAligner * | get_qual_adj_aligner () const |
const Aligner * | get_regular_aligner () const |
Static Protected Member Functions | |
static int | score_extension_group (const Alignment &aln, const vector< GaplessExtension > &extended_seeds, int gap_open_penalty, int gap_extend_penalty) |
static size_t | immutable_path_from_length (const ImmutablePath &path) |
static Path | to_path (const ImmutablePath &path) |
Protected Attributes | |
const PathPositionHandleGraph * | path_graph |
const std::vector< gbwtgraph::DefaultMinimizerIndex * > & | minimizer_indexes |
MinimumDistanceIndex & | distance_index |
const gbwtgraph::GBWTGraph & | gbwt_graph |
This is our primary graph. More... | |
GaplessExtender | extender |
We have a gapless extender to extend seed hits in haplotype space. More... | |
SnarlSeedClusterer | clusterer |
We have a clusterer. More... | |
FragmentLengthDistribution | fragment_length_distr |
std::vector< double > | phred_at_least_one |
Static Protected Attributes | |
constexpr static size_t | PRECISION = 8 |
Use this many bits for approximate probabilities. More... | |
Additional Inherited Members | |
![]() | |
static int8_t * | parse_matrix (std::istream &matrix_stream) |
Allocates an array to hold a 4x4 substitution matrix and returns it. More... | |
|
protected |
The information we store for each cluster.
|
protected |
We define a type for shared-tail lists of Mappings, to avoid constantly copying Path objects.
|
protected |
The information we store for each seed.
vg::MinimizerMapper::MinimizerMapper | ( | const gbwtgraph::GBWTGraph & | graph, |
const std::vector< gbwtgraph::DefaultMinimizerIndex * > & | minimizer_indexes, | ||
MinimumDistanceIndex & | distance_index, | ||
const PathPositionHandleGraph * | path_graph = nullptr |
||
) |
Construct a new MinimizerMapper using the given indexes. The PathPositionhandleGraph can be nullptr, as we only use it for correctness tracking.
void vg::MinimizerMapper::attempt_rescue | ( | const Alignment & | aligned_read, |
Alignment & | rescued_alignment, | ||
bool | rescue_forward | ||
) |
Given an aligned read, extract a subgraph of the graph within a distance range based on the fragment length distribution and attempt to align the unaligned read to it. Rescue_forward is true if the aligned read is the first and false otherwise. Assumes that both reads are facing the same direction. TODO: This should be const, but some of the function calls are not.
|
protected |
Compute MAPQ caps based on all minimizers present in extended clusters.
Needs access to the input alignment for sequence and quality information.
Returns only an "extended" cap at the moment.
|
protected |
The same as dfs_gbwt on a handle and an offset, but takes a gbwt::SearchState that defines only some haplotypes on a handle to start with.
|
protected |
Run a DFS on valid haplotypes in the GBWT starting from the given Position, and continuing up to the given number of bases.
Calls enter_handle when the DFS enters a haplotype visit to a particular handle, and exit_handle when it exits a visit. These let the caller maintain a stack and track the traversals.
The starting node is only entered if its offset isn't equal to its length (i.e. bases remain to be visited).
Stopping early is not permitted.
|
protected |
The same as dfs_gbwt on a Position, but takes a handle in the backing gbwt_graph and an offset from the start of the handle instead.
Get the distance between a pair of read alignments
|
inline |
|
protected |
Find for each pair of extended seeds all the haplotype-consistent graph paths against which the intervening read sequence needs to be aligned.
Limits walks from each extended seed end to the longest detectable gap plus the remaining to-be-alinged sequence, both computed using the read length.
extended_seeds must be sorted by read start position. Any extended seeds that overlap in the read will be precluded from connecting.
numeric_limits<size_t>::max() is used to store sufficiently long Paths ending before sources (which cannot be reached from other extended seeds) and starting after sinks (which cannot reach any other extended seeds). Only sources and sinks have these "tail" paths.
Tail paths are only calculated if the MinimizerMapper has linear_tails set to true.
|
protected |
Find the minimizers in the sequence using all minimizer indexes and return them sorted in descending order by score.
|
protected |
Operating on the given input alignment, align the tails dangling off the given extended perfect-match seeds and produce an optimal alignment into the given output Alignment object, best, and the second best alignment into second_best.
|
protected |
Find seeds for all minimizers passing the filters.
|
inline |
|
protected |
Find the best alignment of the given sequence against any of the trees provided in trees, where each tree is a TreeSubgraph over the GBWT graph. Each tree subgraph is rooted at the left in its own local coordinate space, even if we are pinning on the right.
If no mapping is possible (for example, because there are no trees), produce a pure insert at default_position.
Alignment is always pinned.
If pin_left is true, pin the alignment on the left to the root of each tree. Otherwise pin it on the right to the root of each tree.
Returns alingments in gbwt_graph space.
|
protected |
Get all the trees defining tails off the specified side of the specified gapless extension. Should only be called if a tail on that side exists, or this is a waste of time.
If the gapless extension starts or ends at a node boundary, there may be multiple trees produced, each with a distinct root.
If the gapless extension abuts the edge of the read, an empty forest will be produced.
Each tree is represented as a TreeSubgraph over our gbwt_graph.
If left_tails is true, the trees read out of the left sides of the gapless extension. Otherwise they read out of the right side.
|
staticprotected |
Get the from length of an ImmutabelPath.
Can't be called path_from_length or it will shadow the one for Paths instead of overloading.
Map the given read. Return a vector of alignments that it maps to, winner first.
void vg::MinimizerMapper::map | ( | Alignment & | aln, |
AlignmentEmitter & | alignment_emitter | ||
) |
Map the given read, and send output to the given AlignmentEmitter. May be run from any thread. TODO: Can't be const because the clusterer's cluster_seeds isn't const.
pair< vector< Alignment >, vector< Alignment > > vg::MinimizerMapper::map_paired | ( | Alignment & | aln1, |
Alignment & | aln2 | ||
) |
Map the given pair of reads, where aln1 is upstream of aln2 and they are oriented towards each other in the graph.
If the fragment length distribution is not yet fixed, reads will be mapped independently. Otherwise, they will be mapped according to the fragment length distribution.
pair< vector< Alignment >, vector< Alignment > > vg::MinimizerMapper::map_paired | ( | Alignment & | aln1, |
Alignment & | aln2, | ||
vector< pair< Alignment, Alignment >> & | ambiguous_pair_buffer | ||
) |
Map the given pair of reads, where aln1 is upstream of aln2 and they are oriented towards each other in the graph.
If the reads are ambiguous and there's no fragment length distribution fixed yet, they will be dropped into ambiguous_pair_buffer.
Otherwise, at least one result will be returned for them (although it may be the unmapped alignment).
|
protected |
Assume that we have n <= max_k independent random events that occur with probability p each (p is interpreted as a real number between 0 and 1 and max_k is the largest k in the minimizer indexes). Return an approximate probability for at least one event occurring as a phred score.
|
protected |
Given a vector of items, a function to get the score of each, a score-difference-from-the-best cutoff, and a min and max processed item count, process items in descending score order by calling process_item with the item's number, until min_count items are processed and either max_count items are processed or the score difference threshold is hit (or we run out of items).
If process_item returns false, the item is skipped and does not count against min_count or max_count.
Call discard_item_by_count with the item's number for all remaining items that would pass the score threshold.
Call discard_item_by_score with the item's number for all remaining items that would fail the score threshold.
|
protected |
Same as the other process_until_threshold functions, except using a vector to supply scores.
|
protected |
Same as the other process_until_threshold functions, except user supplies comparator to sort the items (must still be sorted by score).
|
protected |
Determine cluster score, read coverage, and a vector of flags for the minimizers present in the cluster. Score is the sum of the scores of distinct minimizers in the cluster, while read coverage is the fraction of the read covered by seeds in the cluster.
|
staticprotected |
Score the given group of gapless extensions. Determines the best score that can be obtained by chaining extensions together, using the given gap open and gap extend penalties to charge for either overlaps or gaps in coverage of the read.
Enforces that overlaps cannot result in containment.
Input extended seeds must be sorted by start position.
|
protected |
Score the set of extensions for each cluster using score_extension_group(). Return the scores in the same order as the extensions.
|
protected |
Score the set of extensions for each cluster using score_extension_group(). Return the scores in the same order as the extensions.
|
staticprotected |
Convert an ImmutablePath to a Path.
|
protected |
Compute a bound on the Phred score probability of having created the agglomerations of the specified minimizers by base errors from the given sequence, which was sequenced with the given qualities.
No limit is imposed if broken is empty.
Takes the collection of all minimizers found, and a vector of the indices of minimizers we are interested in the agglomerations of. May modify the order of that index vector.
Also takes the sequence of the read (to avoid Ns) and the quality string (interpreted as a byte array).
Currently computes a lower-score-bound, upper-probability-bound, suitable for use as a mapping quality cap, by assuming the easiest-to-disrupt possible layout of the windows, and the lowest possible qualities for the disrupting bases.
double vg::MinimizerMapper::cluster_coverage_threshold = 0.4 |
double vg::MinimizerMapper::cluster_score_threshold = 50 |
|
protected |
We have a clusterer.
|
protected |
size_t vg::MinimizerMapper::distance_limit = 200 |
bool vg::MinimizerMapper::do_dp = true |
|
protected |
We have a gapless extender to extend seed hits in haplotype space.
int vg::MinimizerMapper::extension_score_threshold = 1 |
double vg::MinimizerMapper::extension_set_score_threshold = 20 |
|
protected |
|
protected |
This is our primary graph.
size_t vg::MinimizerMapper::hard_hit_cap = 300 |
Ignore all minimizers with more than hard_hit_cap hits.
size_t vg::MinimizerMapper::hit_cap = 10 |
Use all minimizers with at most hit_cap hits.
size_t vg::MinimizerMapper::max_alignments = 8 |
How many extended clusters should we align, max?
size_t vg::MinimizerMapper::max_extensions = 48 |
How many clusters should we align?
size_t vg::MinimizerMapper::max_local_extensions = numeric_limits<size_t>::max() |
How many extensions should we try as seeds within a mapping location?
size_t vg::MinimizerMapper::max_multimaps = 1 |
size_t vg::MinimizerMapper::max_rescue_attempts = 0 |
For paired end mapping, how many times should we attempt rescue (per read)?
|
protected |
double vg::MinimizerMapper::minimizer_score_fraction = 0.6 |
Take minimizers between hit_cap and hard_hit_cap hits until this fraction of total score
double vg::MinimizerMapper::pad_cluster_score_threshold = 20 |
|
protected |
|
protected |
Assume that we have n <= max_k independent events with probability p each. Let x be the PRECISION most significant bits of p. Then
phred_at_least_one[(n << PRECISION) + x]
is an approximate phred score of at least one event occurring.
|
staticconstexprprotected |
Use this many bits for approximate probabilities.
string vg::MinimizerMapper::read_group |
RescueAlgorithm vg::MinimizerMapper::rescue_algorithm = rescue_dozeu |
The algorithm used for rescue.
string vg::MinimizerMapper::sample_name |
bool vg::MinimizerMapper::track_correctness = false |
Guess which seed hits are correct by location in the linear reference and track if/when their descendants make it through stages of the algorithm. Only works if track_provenance is true.
bool vg::MinimizerMapper::track_provenance = false |
Track which internal work items came from which others during each stage of the mapping algorithm.