+
Skip to content

bwa mem 20% speedup with vector instructions (Fixed intel-extend branch) #226

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ test64
.*.swp
Makefile.bak
bwamem-lite
check_avx2_support
56 changes: 49 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
CC= gcc
#CC= clang --analyze
CFLAGS= -g -Wall -Wno-unused-function -O2
CFLAGS= -g -Wall -Wno-unused-function -O3
CXXFLAGS= -g -Wall -Wno-unused-variable -O3
WRAP_MALLOC=-DUSE_MALLOC_WRAPPERS
AR= ar
DFLAGS= -DHAVE_PTHREAD $(WRAP_MALLOC)
LOBJS= utils.o kthread.o kstring.o ksw.o bwt.o bntseq.o bwa.o bwamem.o bwamem_pair.o bwamem_extra.o malloc_wrap.o \
intel_ext.o intel_opt/fast_extend_engine.o intel_opt/extend_vec.o \
QSufSort.o bwt_gen.o rope.o rle.o is.o bwtindex.o
AOBJS= bwashm.o bwase.o bwaseqio.o bwtgap.o bwtaln.o bamlite.o \
bwape.o kopen.o pemerge.o maxk.o \
Expand All @@ -19,27 +21,61 @@ ifeq ($(shell uname -s),Linux)
LIBS += -lrt
endif

.SUFFIXES:.c .o .cc
COMPILER_VER:=$(shell $(CC) -dumpversion | \
sed -e 's/\.\([0-9][0-9]\)/\1/g' \
-e 's/\.\([0-9]\)/0\1/g' \
-e 's/^[0-9]\{3,4\}$$/&00/' )

AVX2_SUPPORTED:=$(shell $(CC) check_avx2_support.c -o check_avx2_support ; ./check_avx2_support | grep Yes)

ifeq "$(CC)" "gcc"
ifeq "$(AVX2_SUPPORTED)" "Yes"
ifeq "$(shell expr $(COMPILER_VER) \>= 40901)" "1"
$(info AVX2 Supported and GCC >= 4.9.1 detected, enabling Intel optimizations with AVX2.)
DFLAGS += -DUSE_AVX2
CFLAGS += -mavx -mavx2
CXXFLAGS += -mavx -mavx2
else
$(info AVX2 Supported but GCC <= 4.9.1 detected, enabling Intel optimizations with SSE4.)
CFLAGS += -msse -msse2 -mssse3 -msse4 -msse4.1 -msse4.2
CXXFLAGS += -msse -msse2 -mssse3 -msse4 -msse4.1 -msse4.2
endif
else
$(info AVX2 Not Supported, enabling Intel optimizations with SSE4.)
CFLAGS += -msse -msse2 -mssse3 -msse4 -msse4.1 -msse4.2
CXXFLAGS += -msse -msse2 -mssse3 -msse4 -msse4.1 -msse4.2
endif
endif


.SUFFIXES:.c .o .cc .cpp


.c.o:
$(CC) -c $(CFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@

.cpp.o:
$(CXX) -c $(CXXFLAGS) $(DFLAGS) $(INCLUDES) $< -o $@

all:$(PROG)

bwa:libbwa.a $(AOBJS) main.o
$(CC) $(CFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)
$(CXX) $(CXXFLAGS) $(DFLAGS) $(AOBJS) main.o -o $@ -L. -lbwa $(LIBS)

bwamem-lite:libbwa.a example.o
$(CC) $(CFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)
$(CXX) $(CXXFLAGS) $(DFLAGS) example.o -o $@ -L. -lbwa $(LIBS)

libbwa.a:$(LOBJS)
$(AR) -csru $@ $(LOBJS)

ed_intrav.o:ed_intrav.cpp
$(CXX) -c $(CXXFLAGS) $(INCLUDES) -DSW_FILTER_AND_EXTEND -o $@ $<

clean:
rm -f gmon.out *.o a.out $(PROG) *~ *.a
rm -f gmon.out *.o a.out $(PROG) *~ *.a intel_opt/*.o check_avx2_support

depend:
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c )
( LC_ALL=C ; export LC_ALL; makedepend -Y -- $(CFLAGS) $(DFLAGS) -- *.c *.cpp )

# DO NOT DELETE THIS LINE -- make depend depends on it.

Expand All @@ -49,7 +85,7 @@ bntseq.o: bntseq.h utils.h kseq.h malloc_wrap.h khash.h
bwa.o: bntseq.h bwa.h bwt.h ksw.h utils.h kstring.h malloc_wrap.h kvec.h
bwa.o: kseq.h
bwamem.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h ksw.h kvec.h
bwamem.o: ksort.h utils.h kbtree.h
bwamem.o: ksort.h utils.h intel_ext.h kbtree.h
bwamem_extra.o: bwa.h bntseq.h bwt.h bwamem.h kstring.h malloc_wrap.h
bwamem_pair.o: kstring.h malloc_wrap.h bwamem.h bwt.h bntseq.h bwa.h kvec.h
bwamem_pair.o: utils.h ksw.h
Expand All @@ -75,6 +111,7 @@ bwtsw2_pair.o: utils.h bwt.h bntseq.h bwtsw2.h bwt_lite.h kstring.h
bwtsw2_pair.o: malloc_wrap.h ksw.h
example.o: bwamem.h bwt.h bntseq.h bwa.h kseq.h malloc_wrap.h
fastmap.o: bwa.h bntseq.h bwt.h bwamem.h kvec.h malloc_wrap.h utils.h kseq.h
fastmap.o: intel_ext.h
is.o: malloc_wrap.h
kopen.o: malloc_wrap.h
kstring.o: kstring.h malloc_wrap.h
Expand All @@ -86,3 +123,8 @@ pemerge.o: ksw.h kseq.h malloc_wrap.h kstring.h bwa.h bntseq.h bwt.h utils.h
rle.o: rle.h
rope.o: rle.h rope.h
utils.o: utils.h ksort.h malloc_wrap.h kseq.h
ed_intrav.o: ed_intrav64.h ed_intrav64x2.h ed_intrav.h ed_intravED.h
ed_intrav.o: ed_fine.h
intel_opt/fast_extend_engine.o: intel_opt/fast_extend.h
intel_opt/extend_vec.o: intel_opt/extend_vec128.h intel_opt/extend_vec128x2.h intel_opt/extend_vec256.h intel_opt/extend_vec256x2.h
intel_ext.o: intel_ext.h
17 changes: 16 additions & 1 deletion bwamem.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include "ksort.h"
#include "utils.h"

#include "intel_ext.h"

#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
#endif
Expand Down Expand Up @@ -720,6 +722,11 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
tmp = s->rbeg - rmax[0];
rs = malloc(tmp);
for (i = 0; i < tmp; ++i) rs[i] = rseq[tmp - 1 - i];
if (opt->flag & MEM_F_FASTEXT) {
a->score = intel_extend(s->qbeg, qs, tmp, rs, 5, opt->mat, opt->o_del, opt->e_del, aw[0], opt->pen_clip5, opt->zdrop, s->len * opt->a, &qle, &tle, &gtle, &gscore, &max_off[0]);
if (a->score != -1)
goto end_left_extend;
}
for (i = 0; i < MAX_BAND_TRY; ++i) {
int prev = a->score;
aw[0] = opt->w << i;
Expand All @@ -732,6 +739,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
if (bwa_verbose >= 4) { printf("*** Left extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[0], max_off[0]); fflush(stdout); }
if (a->score == prev || max_off[0] < (aw[0]>>1) + (aw[0]>>2)) break;
}

end_left_extend:
// check whether we prefer to reach the end of the query
if (gscore <= 0 || gscore <= a->score - opt->pen_clip5) { // local extension
a->qb = s->qbeg - qle, a->rb = s->rbeg - tle;
Expand All @@ -748,6 +757,11 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
qe = s->qbeg + s->len;
re = s->rbeg + s->len - rmax[0];
assert(re >= 0);
if (opt->flag & MEM_F_FASTEXT) {
a->score = intel_extend(l_query - qe, (uint8_t*)query + qe, rmax[1] - rmax[0] - re, rseq + re, 5, opt->mat, opt->o_del, opt->e_del, aw[1], opt->pen_clip3, opt->zdrop, sc0, &qle, &tle, &gtle, &gscore, &max_off[1]);
if (a->score != -1)
goto end_right_extend;
}
for (i = 0; i < MAX_BAND_TRY; ++i) {
int prev = a->score;
aw[1] = opt->w << i;
Expand All @@ -760,6 +774,8 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
if (bwa_verbose >= 4) { printf("*** Right extension: prev_score=%d; score=%d; bandwidth=%d; max_off_diagonal_dist=%d\n", prev, a->score, aw[1], max_off[1]); fflush(stdout); }
if (a->score == prev || max_off[1] < (aw[1]>>1) + (aw[1]>>2)) break;
}

end_right_extend:
// similar to the above
if (gscore <= 0 || gscore <= a->score - opt->pen_clip3) { // local extension
a->qe = qe + qle, a->re = rmax[0] + re + tle;
Expand All @@ -770,7 +786,6 @@ void mem_chain2aln(const mem_opt_t *opt, const bntseq_t *bns, const uint8_t *pac
}
} else a->qe = l_query, a->re = s->rbeg + s->len;
if (bwa_verbose >= 4) printf("*** Added alignment region: [%d,%d) <=> [%ld,%ld); score=%d; {left,right}_bandwidth={%d,%d}\n", a->qb, a->qe, (long)a->rb, (long)a->re, a->score, aw[0], aw[1]);

// compute seedcov
for (i = 0, a->seedcov = 0; i < c->n; ++i) {
const mem_seed_t *t = &c->seeds[i];
Expand Down
1 change: 1 addition & 0 deletions bwamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ typedef struct __smem_i smem_i;
#define MEM_F_PRIMARY5 0x800
#define MEM_F_KEEP_SUPP_MAPQ 0x1000
#define MEM_F_XB 0x2000
#define MEM_F_FASTEXT 0x4000

typedef struct {
int a, b; // match score and mismatch penalty
Expand Down
9 changes: 9 additions & 0 deletions check_avx2_support.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include <stdio.h>
#include "hsw.c"
int main() {
if(can_use_intel_core_4th_gen_features())
printf("Yes\n");
else
printf("No\n");
return 0;
}
11 changes: 8 additions & 3 deletions fastmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "utils.h"
#include "bntseq.h"
#include "kseq.h"
#include "intel_ext.h"
KSEQ_DECLARE(gzFile)

extern unsigned char nst_nt4_table[256];
Expand Down Expand Up @@ -130,7 +131,7 @@ int main_mem(int argc, char *argv[])

aux.opt = opt = mem_opt_init();
memset(&opt0, 0, sizeof(mem_opt_t));
while ((c = getopt(argc, argv, "51qpaMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:")) >= 0) {
while ((c = getopt(argc, argv, "51qpafFMCSPVYjuk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:o:f:W:x:G:h:y:K:X:H:")) >= 0) {
if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1;
else if (c == '1') no_mt_io = 1;
else if (c == 'x') mode = optarg;
Expand Down Expand Up @@ -226,8 +227,10 @@ int main_mem(int argc, char *argv[])
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n",
__func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low);
}
else return 1;
} else if (c == 'F') {
opt->flag |= MEM_F_FASTEXT;
intel_init();
} else return 1;
}

if (rg_line) {
Expand All @@ -253,6 +256,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw);
fprintf(stderr, " -S skip mate rescue\n");
fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n");
fprintf(stderr, " -F Use Intel's filter and extend \n");
fprintf(stderr, "\nScoring options:\n\n");
fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a);
fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", opt->b);
Expand All @@ -269,6 +273,7 @@ int main_mem(int argc, char *argv[])
fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n");
fprintf(stderr, " -H STR/FILE insert STR to header if it starts with @; or insert lines in FILE [null]\n");
fprintf(stderr, " -o FILE sam file to output results to [stdout]\n");
fprintf(stderr, " -f FILE sam file to output results to [stdout]\n");
fprintf(stderr, " -j treat ALT contigs as part of the primary assembly (i.e. ignore <idxbase>.alt file)\n");
fprintf(stderr, " -5 for split alignment, take the alignment with the smallest coordinate as primary\n");
fprintf(stderr, " -q don't modify mapQ of supplementary alignments\n");
Expand Down
80 changes: 80 additions & 0 deletions hsw.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#include <stdint.h>


void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd) {
uint32_t ebx, edx;

__asm__("cpuid" : "+b" (ebx), "+a" (eax), "+c" (ecx), "=d" (edx));
abcd[0] = eax;
abcd[1] = ebx;
abcd[2] = ecx;
abcd[3] = edx;
}

int check_xcr0_ymm() {
uint32_t xcr0;
__asm__("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx");
return ((xcr0 & 6) == 6); /* checking if xmm and ymm state are enabled in XCR0 */
}

int check_4th_gen_intel_core_features() {
uint32_t abcd[4];
uint32_t fma_movbe_osxsave_mask = ((1 << 12) | (1 << 22) | (1 << 27));
uint32_t avx2_bmi12_mask = (1 << 5) | (1 << 3) | (1 << 8);
/* CPUID.(EAX=01H, ECX=0H):ECX.FMA[bit 12]==1 &&
CPUID.(EAX=01H, ECX=0H):ECX.MOVBE[bit 22]==1 &&
CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1 */
run_cpuid(1, 0, abcd);
if ((abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask)
return 0;
if (!check_xcr0_ymm())
return 0;
/* CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 &&
CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 &&
CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1 */
run_cpuid(7, 0, abcd);
if ((abcd[1] & avx2_bmi12_mask) != avx2_bmi12_mask)
return 0;
/* CPUID.(EAX=80000001H):ECX.LZCNT[bit 5]==1 */
run_cpuid(0x80000001, 0, abcd);
if ((abcd[2] & (1 << 5)) == 0)
return 0;
return 1;
}


static int can_use_intel_core_4th_gen_features() {
static int the_4th_gen_features_available = -1;
/* test is performed once */
if (the_4th_gen_features_available < 0)
the_4th_gen_features_available = check_4th_gen_intel_core_features();
return the_4th_gen_features_available;
}

int is_cpuid_ecx_bit_set(int eax, int bitidx) {
int ecx = 0, edx = 0, ebx = 0;
__asm__("cpuid"
: "=b" (ebx),
"=c" (ecx),
"=d" (edx)
: "a" (eax)
);
return (((ecx >> bitidx)&1) == 1);
}

int is_sse42_supported() {
return is_cpuid_ecx_bit_set(1, 20);
}

/*
#include <stdio.h>
int main(int argc, char** argv)
{
if ( can_use_intel_core_4th_gen_features() )
printf("This CPU supports ISA extensions introduced in Haswell\n");
else
printf("This CPU does not support all ISA extensions introduced in Haswell\n");
return 1;
}
*/

Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载