-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalignment.h
365 lines (320 loc) · 17.6 KB
/
alignment.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
#ifndef _ALIGNMENT_H_
#define _ALIGNMENT_H_
#include "codes.h"
#include "read_quality.h"
/* -------------------------------------------------------
* Score
*/
typedef int ScoreType;
/* -------------------------------------------------------
* Alignment cell 'metadata': direction of the current partial alignment
*/
typedef char DirectionType;
/* -------------------------------------------------------
* Alignment parameters
*/
/* Default alignment parameters (scores, penalties) */
#define DEFAULT_MATCH (+5)
#define DEFAULT_MISMATCH (-4)
#define DEFAULT_GAP_OPEN (-9)
#define DEFAULT_GAP_EXTEND (-4)
#define DEFAULT_ALLOWED_INDELS (3)
#define SCALE_SCORE(score, quality_level) (ScoreType)( ((score) * (quality_level+1) + 0.) / READ_QUALITY_LEVELS)
/* For accessing various parameters */
#define GAP_TYPES_SIZE 2
#define GAP_OPEN_IDX 1
#define GAP_EXTEND_IDX 0
#define SYMBOL_PAIR_TYPES_SIZE 2
#define SYMBOL_PAIR_MATCH_IDX 0
#define SYMBOL_PAIR_MISMATCH_IDX 1
/* Symbols used for displaying the alignment */
#define SEQ_GAP_SYMBOL '-'
/* Used for pre-computed alignment parameter quick access: */
#define INDEX_FOR(code) ((code)?1:0)
/* Used as default "bad" score, or as the worst penalty (must be a strong negative) */
#define MIN_SCORE (-16383)
/* Score threshold */
#define MIN_ACCEPTED_SCORE 115
#define MIN_ACCEPTED_SCORE_SIMD_FILTER 100
extern ScoreType min_accepted_score;
extern ScoreType min_accepted_score_simd_filter;
/* For traceback: 4bit codes for various types of pairs : two 4bits codes are stored in one byte */
/* mask for separating upped/lower part of each pair */
#define TRACEBACK_PAIR_MASK 0xC /* 11XX */
#define TRACEBACK_CODE_MASK 0x3 /* XX11 */
/* codes for traceback : note that some can overlap letter codes !!! */
#define TRACEBACK_PAIR_MATCH 0x3 /* 0011 */
#define TRACEBACK_PAIR_CODE_MISMATCH 0x4 /* 01XX (XX = (color) code in the read) */
#ifndef NUCLEOTIDES
#define TRACEBACK_PAIR_COLORBASE_MISMATCH 0x8 /* 10XX (XX = color code in the read) */
#endif
#define TRACEBACK_PAIR_READ_INSERTION 0xC /* 11XX (XX = (color) code inserted in the read) */
#define TRACEBACK_PAIR_READ_DELETION 0x1 /* 0001 */
#define TRACEBACK_PAIR_UNDEFINED 0x0 /* 0000 */
/* wrap colors in their backtracing codes */
#define TRACEBACK_MATCH (TRACEBACK_PAIR_MATCH)
#ifndef NUCLEOTIDES
#define TRACEBACK_COLORBASE_MISMATCH(code) (TRACEBACK_PAIR_COLORBASE_MISMATCH | (code))
#endif
#define TRACEBACK_CODE_MISMATCH(code) (TRACEBACK_PAIR_CODE_MISMATCH | (code))
#define TRACEBACK_INSERT(code) (TRACEBACK_PAIR_READ_INSERTION | (code))
#define TRACEBACK_DELETE (TRACEBACK_PAIR_READ_DELETION)
/* access : two 4bits codes are stored in one byte */
#define TRACEBACK_SEQ_SIZE(size) (((size) + 1) >> 1)
#define TRACEBACK_SEQ_GET(seq, i) ((seq[(i) >> 1] >> (((i) & (1)) ? 0 : 4)) & 0xF)
#define TRACEBACK_SEQ_SET(seq, i, code) seq[(i) >> 1] = ((i) & 1) ? ((seq[(i) >> 1] & 0xF0) | ((code) & 0x0F)) : ((seq[(i) >> 1] & 0x0F) | (((code) << 4) & 0xF0))
#define TRACEBACK_SEQ_FILL(seq, len, code, iterator) for (iterator = 0; iterator < len; ++iterator) {TRACEBACK_SEQ_SET(seq, i, code);}
#ifdef NUCLEOTIDES
#define GET_PAIR_TYPE(code) ((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_CODE_MISMATCH) ? TRACEBACK_PAIR_CODE_MISMATCH : \
(((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_READ_INSERTION) ? TRACEBACK_PAIR_READ_INSERTION : (code))
#else
#define GET_PAIR_TYPE(code) ((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_CODE_MISMATCH) ? TRACEBACK_PAIR_CODE_MISMATCH : \
((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_COLORBASE_MISMATCH) ? TRACEBACK_PAIR_COLORBASE_MISMATCH : \
((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_READ_INSERTION) ? TRACEBACK_PAIR_READ_INSERTION : (code))))
#endif
#ifdef NUCLEOTIDES
#define GET_PAIR_TYPE_SYMBOL(code) ((((code) == TRACEBACK_PAIR_MATCH) ? 'M' : \
((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_CODE_MISMATCH) ? 'S' : \
((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_READ_INSERTION) ? 'I' : \
(( (code) == TRACEBACK_PAIR_READ_DELETION) ? 'D' : ('?' + code))))))
#else
#define GET_PAIR_TYPE_SYMBOL(code) (( (code) == TRACEBACK_PAIR_MATCH) ? 'M' : \
((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_CODE_MISMATCH) ? 'E' : \
((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_COLORBASE_MISMATCH) ? 'S' : \
((((code) & TRACEBACK_PAIR_MASK) == TRACEBACK_PAIR_READ_INSERTION) ? 'I' : \
(( (code) == TRACEBACK_PAIR_READ_DELETION) ? 'D' : ('?' + code))))))
#endif
#define TRACEBACK_DISPLAY(sequence, length, output) { \
fprintf(output, " "); \
int _i; \
for (_i = 0; _i < (length); ++_i) { \
fprintf(output, "%c", \
GET_PAIR_TYPE_SYMBOL(TRACEBACK_SEQ_GET((sequence), _i))); \
} \
fprintf(output, "\n "); \
for (_i = 0; _i < length; ++_i) { \
unsigned char __pts = GET_PAIR_TYPE_SYMBOL(TRACEBACK_SEQ_GET(sequence, _i)); \
fprintf(output, "%c", (__pts == 'M' || __pts == 'D') ? ' ' : ('0' + (TRACEBACK_CODE_MASK & TRACEBACK_SEQ_GET(sequence, _i)))); \
} \
}
/**
* @struct AlignmentParamType
* keep the scoring parameters
*/
typedef struct AlignmentParamType {
/** Opening, extension... */
ScoreType gap[GAP_TYPES_SIZE];
/** Match and mismatch scores are pre-computed for several quality intervals
* by the __set_params function.
*/
ScoreType pair_scores[SYMBOL_PAIR_TYPES_SIZE][READ_QUALITY_LEVELS];
/** Number of allowed indels */
short allowed_indels;
} AlignmentParamType;
#ifndef NUCLEOTIDES
/* Value given to the color_mismatch_freshness field in the alignment cells when
* there is no uncorrected color mismatch on the path, or the last color mismatch is too far :
*/
#define COLOR_MISMATCH_NONE 0
/* Value given to the color_mismatch_freshness field in the alignment cells where
* there was a color mismatch that changed the checksum from CODE_IDENTITY.
* This value 'wears off' in each as the path continues without any correction.
*/
#define COLOR_MISMATCH_NOW 4
#endif
/**
* @struct AlignmentCellType
* The data held in a cell from the alignment matrix
*/
typedef struct AlignmentCellType {
/** Partial score of the partial alignment that ends in this cell */
ScoreType score;
#ifndef NUCLEOTIDES
/** To differentiate (for example) two reading errors from a SNP*/
CODE_TYPE color_checksum;
#endif
/** For backtracking: relative coordinates of the cell that contributed to this score */
DirectionType from;
/** Number of indels existing so far on the path leading to this cell*/
short indels;
#ifndef NUCLEOTIDES
/** If at some point there was a color mismatch, this field states how fresh it is; if the mismatch was
* many cases ago in the path, we consider it is unlikely to be part of an SNP sequence and treat it
* as a misread color. Therefore, we no longer expect a color correction pair on the path, and instead
* correct the color checksum automatically */
short color_mismatch_freshness;
/** The code used for automatic correction in the case described above.
* Note: the neutral (and default) code is BLUE (=CODE_IDENTITY), and each color is its own inverse. */
CODE_TYPE color_autocorrect;
#endif
} AlignmentCellType;
/* ------------------------------------------------------
* The data necessary to build an alignment
*/
/**
* @struct AlignmentType
* The alignment data structure
*/
typedef struct AlignmentType {
/** Parameters: match, mismatch, gap */
AlignmentParamType params;
/** Sizes of the aligned sequences */
/** @{ */
short read_len,ref_len;
/** @} */
/** The read */
CODE_TYPE* read;
/** Its quality */
ScoreType* read_quality;
/** The reference sequence */
CODE_TYPE* reference;
/** The reference masked (N) */
CODE_TYPE* reference_masked;
/** The alignment matrix ("allocated" one) */
AlignmentCellType** matrix_alloc;
/** The alignment matrix */
AlignmentCellType** matrix;
/** The matrix is not entirely stored into memory; the range fields tells which parts are stored from each line */
/** @{ */
short* range_first;
short range_shift;
/** @} */
/** Coordinates of the cell where the best scoring alignment ends, set by the __align function */
/** @{ */
short best_i, best_j;
/** @} */
/** Coordinates of the cell where the best scoring alignment starts, set by the __traceback function */
/** @{ */
short best_i0, best_j0;
/** @} */
/** Representation of the alignment, set (and returned) by the __traceback function */
char* to_display;
/** traceback sequence of bytes filled with the TRACEBACK_MATCH(...), TRACEBACK_CODE_MISMATCH(...),
* TRACEBACK_BASE_MISMATCH(...), TRACEBACK_DELETE(...) and TRACEBACK_INSERT(...) macros
*/
unsigned char* traceback;
/** traceback length */
int traceback_seq_len;
} AlignmentType;
/**
* Alignment structures creation and initialization
* @param read Read sequence
* @param read_qualiry Read sequence quality
* @param read_len Read length
* @param reference Reference sequence, to which the read is aligned
* @param read_len Reference sequence length
* @param allowed_indels Max number of allowed indels in the alignment; usually small, it prevents from computing the entire DP matrix, thus the reducing computation time.
* @return The address of the alignment data structure
*/
AlignmentType* alignment__create(short read_len, short ref_len, short allowed_indels);
/**
* Alignment structures creation and initialization, for just one use (read and reference are already provided)
* @param read Read sequence
* @param read_quality Read sequence quality
* @param read_len Read length
* @param reference Reference sequence, to which the read is aligned
* @param reference_masked Reference sequence mask, to which the read is aligned
* @param allowed_indels Max number of allowed indels in the alignment; usually small, it prevents from computing the entire DP matrix, thus the reducing computation time.
* @return The address of the alignment data structure
*/
AlignmentType* alignment__create_single(CODE_TYPE* read, short* read_quality, short read_len, CODE_TYPE* reference, CODE_TYPE* reference_masked, short ref_len, short allowed_indels);
/**
* Alignment structures creation and initialization
* @param alignment
* @param read_len Read length
* @param read_len Reference sequence length
* @param allowed_indels Max number of allowed indels in the alignment; usually small, it prevents from computing the entire DP matrix, thus the reducing computation time.
*/
void alignment__init(AlignmentType* alignment, short read_len, short ref_len, short allowed_indels);
/**
* Alignment algorithm parameter settings.
* @param alignment The address of the alignment data structure
* @param match The match score
* @param mismatch The mismatch penalty
* @param gap_open The penalty for gap opening
* @param gap_extend The penalty for gap extension
*/
void alignment__set_params(AlignmentType * alignment, short match, short mismatch, short gap_open, short gap_extend);
/**
* Perform the actual alignment
* @param alignment The address of the alignment data structure
* @param min_accepted Give up the alignment as soon as it can be shown that it has no chane to reach this minimum imposed score
* @return The score of the best alignment
*/
int alignment__align(AlignmentType * alignment, ScoreType min_accepted);
/**
* Reconstructs the alignment from the matrix
* @param alignment The address of the alignment data structure
* @param i The line number of the best scoring cell (where the best alignment ends)
* @param j The column number of the best scoring cell (where the best alignment ends)
* @return A string containing the representation of this alignment (which the to_display field in the alignment structure)
*/
char* alignment__traceback(AlignmentType *alignment);
/**
* Reuse the alignment data structure, when the read and reference have the same lengths as the previous ones aligned.
* @param alignment The address of the data structure to reuse
* @param read The new read sequence, known to have the length alignment->read_len
* @param read_quality The new read quality sequence, known to have the length alignment->read_len
* @param reference The new reference sequence, known to have the length alignment->ref_len
* @param reference_masked The new reference sequence mask, known to have the length alignment->ref_len
*/
void alignment__reset(AlignmentType *alignment, CODE_TYPE* read, short* read_quality, CODE_TYPE* reference, CODE_TYPE* reference_masked);
/**
* Reuse the alignment data structure, when the read and reference have the same lengths as the previous ones aligned, and come from compressed sources.
* @param alignment The address of the data structure to reuse
* @param read The new read sequence, known to have the length alignment->read_len
* @param read_quality The new read quality sequence, known to have the length alignment->read_len
* @param reference The new reference sequence, known to have the length alignment->ref_len
* @param reference_masked The new reference sequence mask, known to have the length alignment->ref_len
* @param offset How many codes to skip in the compressed reference and reference_masked
*/
void alignment__reset_with_compressed(AlignmentType *alignment, CODE_TYPE* read, QUAL_TYPE* read_quality, CODE_TYPE* reference, CODE_TYPE* reference_masked, int ref_offset);
/**
* Reuse the alignment data structure, for a new read that has the same length as the previous one, and comes from a compressed source.
* the matrix is not initialized, as the new reference is also ewpected.
* @param alignment The address of the data structure to reuse
* @param reference The new reference sequence, known to have the length alignment->ref_len
* @param offset How many codes to skip in the compressed reference
*/
void alignment__reset_with_compressed_read(AlignmentType *alignment,CODE_TYPE* read, QUAL_TYPE* read_quality) ;
/**
* Reuse the alignment data structure, when the read stays the same and reference has the same lengths as the previous one, and comes from a compressed source.
* @param alignment The address of the data structure to reuse
* @param reference The new reference sequence, known to have the length alignment->ref_len
* @param reference_masked The new reference mask sequence, known to have the length alignment->ref_len
* @param offset How many codes to skip in the compressed reference
*/
void alignment__reset_with_compressed_ref(AlignmentType *alignment, CODE_TYPE* reference, CODE_TYPE* reference_masked, int ref_offset);
/**
* Retrieve a score in the alignment matrix, with coordinates that are not "aware" of the matrix' sparseness
* @param alignment The address of the alignment data structure
* @param i The requested cell's line coordinate
* @param j The requested cell's column coordinate, as a number from 0 to reference length (to be transformed in actual matrix coords)
* @return The found score, or MIN_SCORE if the parameters are out of range
*/
ScoreType alignment__get_score(const AlignmentType * alignment, int i, int j);
/**
* Frees the memory of an alignment
* @param alignment The address of the data structure to be disposed of
*/
void alignment__destroy(AlignmentType *alignment);
/**
* Generates the CIGAR string and the right DNA sequence (no reading errors) from
* the code sequence alignment.
*
* @param traceback a compressed representation of the alignment between the read and the reference
* @param traceback_len the length of this alignment
* @param ref_sequence pointer to the beginning of the reference fragment aligned to the read
* @param offset the offset of the beginning
* @param ref_start_symbol the first symbol of the reference sequence
* @param CIGAR_dest the CIGAR representation will be written here
* @param SEQ_dest the DNA sequence will be written here
* @return The edit distance
*/
int alignment__traceback_to_CIGAR(const unsigned char* traceback, const int traceback_len,
const CODE_TYPE* ref_sequence, const int offset,
#ifndef NUCLEOTIDES
const char ref_start_symbol,
#endif
char* CIGAR_dest, char* SEQ_dest);
#endif /* _ALIGNMENT_H_ */