+
Skip to content

mmap'ed access to reference files #40

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
48 changes: 37 additions & 11 deletions bwa.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "utils.h"
#include "kstring.h"
#include "kvec.h"
#include "bwt.h"

#ifdef USE_MALLOC_WRAPPERS
# include "malloc_wrap.h"
Expand Down Expand Up @@ -231,7 +232,7 @@ char *bwa_idx_infer_prefix(const char *hint)
}
}

bwt_t *bwa_idx_load_bwt(const char *hint)
bwt_t *bwa_idx_load_bwt(const char *hint, int use_mmap)
{
char *tmp, *prefix;
bwt_t *bwt;
Expand All @@ -242,14 +243,22 @@ bwt_t *bwa_idx_load_bwt(const char *hint)
}
tmp = calloc(strlen(prefix) + 5, 1);
strcat(strcpy(tmp, prefix), ".bwt"); // FM-index
bwt = bwt_restore_bwt(tmp);
bwt = bwt_restore_bwt(tmp, use_mmap);
strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA)
bwt_restore_sa(tmp, bwt);
bwt_restore_sa(tmp, bwt, use_mmap);
free(tmp); free(prefix);
return bwt;
}

bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
void* bwa_load_pac_mmap(const char* prefix)
{
char pac_filename[1024];
strcat(strcpy(pac_filename, prefix), ".pac");

return bwt_ro_mmap_file(pac_filename, 0);
}

bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which, int use_mmap)
{
bwaidx_t *idx;
char *prefix;
Expand All @@ -259,7 +268,7 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
return 0;
}
idx = calloc(1, sizeof(bwaidx_t));
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint);
if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint, use_mmap);
if (which & BWA_IDX_BNS) {
int i, c;
idx->bns = bns_restore(prefix);
Expand All @@ -268,8 +277,14 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
if (bwa_verbose >= 3)
fprintf(stderr, "[M::%s] read %d ALT contigs\n", __func__, c);
if (which & BWA_IDX_PAC) {
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
if (use_mmap) {
idx->pac_mmap = bwa_load_pac_mmap(prefix);
idx->pac = (uint8_t*)idx->pac_mmap;
}
else {
idx->pac = calloc(idx->bns->l_pac/4+1, 1);
err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence
}
err_fclose(idx->bns->fp_pac);
idx->bns->fp_pac = 0;
}
Expand All @@ -278,22 +293,33 @@ bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which)
return idx;
}

bwaidx_t *bwa_idx_load(const char *hint, int which)
bwaidx_t *bwa_idx_load(const char *hint, int which, int use_mmap)
{
return bwa_idx_load_from_disk(hint, which);
return bwa_idx_load_from_disk(hint, which, use_mmap);
}

void bwa_idx_destroy(bwaidx_t *idx)
{
if (idx == 0) return;
if (idx->mem == 0) {
if (idx->bwt) bwt_destroy(idx->bwt);
if (idx->pac) {
if (idx->pac_mmap) {
fprintf(stderr, "Unmapping idx->pac_mmap\n");
bwt_unmap_file(idx->pac_mmap, idx->bns->l_pac/4+1);
}
else
free(idx->pac);
}
idx->pac = NULL;
if (idx->bns) bns_destroy(idx->bns);
if (idx->pac) free(idx->pac);
} else {
}
else { // shm
free(idx->bwt); free(idx->bns->anns); free(idx->bns);
if (!idx->is_shm) free(idx->mem);
idx->mem = 0;
}

free(idx);
}

Expand Down
8 changes: 5 additions & 3 deletions bwa.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ typedef struct {
int is_shm;
int64_t l_mem;
uint8_t *mem;

void *pac_mmap; // ptr to memory-mapped pac file (if mmap is being used)
} bwaidx_t;

typedef struct {
Expand All @@ -42,11 +44,11 @@ extern "C" {
uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM);

char *bwa_idx_infer_prefix(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint);
bwt_t *bwa_idx_load_bwt(const char *hint, int use_mmap);

bwaidx_t *bwa_idx_load_from_shm(const char *hint);
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which);
bwaidx_t *bwa_idx_load(const char *hint, int which);
bwaidx_t *bwa_idx_load_from_disk(const char *hint, int which, int use_mmap);
bwaidx_t *bwa_idx_load(const char *hint, int which, int use_mmap);
void bwa_idx_destroy(bwaidx_t *idx);
int bwa_idx2mem(bwaidx_t *idx);
int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx);
Expand Down
1 change: 1 addition & 0 deletions bwamem.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ typedef struct {
int max_matesw; // perform maximally max_matesw rounds of mate-SW for each end
int max_XA_hits, max_XA_hits_alt; // if there are max_hits or fewer, output them all
int8_t mat[25]; // scoring matrix; mat[0] == 0 if unset
int use_mmap; // use mmap to access index files
} mem_opt_t;

typedef struct {
Expand Down
8 changes: 4 additions & 4 deletions bwape.c
Original file line number Diff line number Diff line change
Expand Up @@ -271,8 +271,8 @@ int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bw
buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t));

if (_bwt == 0) { // load forward SA
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str, 0);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt, 0);
} else bwt = _bwt;

// SE
Expand Down Expand Up @@ -661,8 +661,8 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f
ks[1] = bwa_open_reads(opt.mode, fn_fa[1]);
{ // for Illumina alignment only
if (popt->is_preload) {
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str, 0);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt, 0);
pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1);
err_rewind(bns->fp_pac);
err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac);
Expand Down
4 changes: 2 additions & 2 deletions bwase.c
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,8 @@ void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_se
char str[1024];
bwt_t *bwt;
// load forward SA
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt);
strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str, 0);
strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt, 0);
for (i = 0; i != n_seqs; ++i) {
bwa_seq_t *p = &seqs[i];
bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr);
Expand Down
2 changes: 1 addition & 1 deletion bwashm.c
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ int main_shm(int argc, char *argv[])
if (optind < argc) {
if (bwa_shm_test(argv[optind]) == 0) {
bwaidx_t *idx;
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL);
idx = bwa_idx_load_from_disk(argv[optind], BWA_IDX_ALL, 0);
if (bwa_shm_stage(idx, argv[optind], tmpfn) < 0) {
fprintf(stderr, "[E::%s] failed to stage the index in shared memory\n", __func__);
ret = 1;
Expand Down
158 changes: 155 additions & 3 deletions bwt.c
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
# include "malloc_wrap.h"
#endif

#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

void bwt_gen_cnt_table(bwt_t *bwt)
{
int i, j;
Expand Down Expand Up @@ -382,6 +388,58 @@ int bwt_seed_strategy1(const bwt_t *bwt, int len, const uint8_t *q, int x, int m
* Read/write BWT and SA *
*************************/

/*
* Create a read-only memory mapping of file fn that is locked in memory.
* If size > 0, then the file will be mapped only up to `size`; else the
* entire file will be mapped.
*
* \return Pointer to memory map; aborts in case of error.
*/
void* bwt_ro_mmap_file(const char *fn, size_t size) {
int fd = open(fn, O_RDONLY);
xassert(fd > -1, "Cannot open file");

struct stat buf;
int s = fstat(fd, &buf);
xassert(s > -1, "cannot stat file");

off_t st_size = buf.st_size;
if (size > 0) {
xassert(st_size >= size, "bad file size");
st_size = size;
}

// mmap flags:
// MAP_PRIVATE: copy-on-write mapping. Writes not propagated to file.
// MAP_POPULATE: prefault page tables for mapping. Use read-ahead. Only supported for MAP_PRIVATE
// MAP_HUGETLB: use huge pages. Manual says it's only supported since kernel ver. 2.6.32
// and requires special system configuration.
// MAP_NORESERVE: don't reserve swap space
// MAP_LOCKED: Lock the pages of the mapped region into memory in the manner of mlock(2)
// Because we try to lock the pages in memory, this call will fail if the system
// doesn't have sufficient physical memory. However, without locking, if the
// system can't quite fit the reference the call to mmap will succeed but aligning
// will take forever as parts of the reference are evicted and/or reloaded from disk.
int map_flags = MAP_PRIVATE | MAP_POPULATE | MAP_NORESERVE | MAP_LOCKED;
fprintf(stderr, "mmapping file %s (%0.1fMB)\n", fn, ((double)st_size) / (1024*1024));
void* m = mmap(0, st_size, PROT_READ, map_flags, fd, 0);
if (m == MAP_FAILED) {
perror(__func__);
err_fatal("Failed to map %s file to memory\n", fn);
}
fprintf(stderr, "File %s locked in memory\n", fn);
close(fd);
// MADV_WILLNEED: Expect access in the near future
madvise(m, st_size, MADV_WILLNEED);
return m;
}

void bwt_unmap_file(void* map, size_t map_size)
{
if (munmap(map, map_size) < 0)
perror(__func__);
}

void bwt_dump_bwt(const char *fn, const bwt_t *bwt)
{
FILE *fp;
Expand Down Expand Up @@ -418,7 +476,7 @@ static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a)
return offset;
}

void bwt_restore_sa(const char *fn, bwt_t *bwt)
void bwt_restore_sa_std(const char *fn, bwt_t *bwt)
{
char skipped[256];
FILE *fp;
Expand All @@ -440,7 +498,50 @@ void bwt_restore_sa(const char *fn, bwt_t *bwt)
err_fclose(fp);
}

bwt_t *bwt_restore_bwt(const char *fn)
void bwt_restore_sa_mmap(const char *fn, bwt_t *bwt)
{
/***************
* File layout:
* primary: bwtint_t
* skipped: 4*bwtint_t
* sa_intv: bwtint_t
* seq_len: bwtint_t
* suffix_array: bwtint_t[]
*/
bwt->sa_mmap = bwt_ro_mmap_file(fn, 0);

bwtint_t* array = (bwtint_t*)bwt->sa_mmap;

bwtint_t tmp;
tmp = array[0];
xassert(tmp == bwt->primary, "SA-BWT inconsistency: primary is not the same.");

bwt->sa_intv = array[5];
tmp = array[6];
xassert(tmp == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same.");

bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv;
bwt->sa = array + 6;

// Allow write permission. Note that the mapping is private so we won't ever
// propagate any changes to the actual file.
size_t protected_length = (void*)bwt->sa + 1 - bwt->sa_mmap;
if (mprotect(bwt->sa_mmap, protected_length, PROT_READ | PROT_WRITE) < 0)
perror_fatal(__func__, "Failed to allow write access to mmaped area\n");
bwt->sa[0] = -1;
// Remove write permission to the memory map
mprotect(bwt->sa_mmap, protected_length, PROT_READ);
}

void bwt_restore_sa(const char *fn, bwt_t *bwt, int use_mmap)
{
if (use_mmap)
bwt_restore_sa_mmap(fn, bwt);
else
bwt_restore_sa_std(fn, bwt);
}

static bwt_t *bwt_restore_bwt_std(const char *fn)
{
bwt_t *bwt;
FILE *fp;
Expand All @@ -461,9 +562,60 @@ bwt_t *bwt_restore_bwt(const char *fn)
return bwt;
}

static bwt_t *bwt_restore_bwt_mmap(const char* fn)
{
bwt_t *bwt;
bwt = (bwt_t*)calloc(1, sizeof(bwt_t));

void* m = bwt_ro_mmap_file(fn, 0);

struct stat buf;
int s = stat(fn, &buf);
xassert(s > -1, "cannot stat file");

bwt->bwt_mmap = m;
bwt->bwt_size = (buf.st_size - sizeof(bwtint_t)*5) >> 2;
bwt->primary = ((bwtint_t*)m)[0];
int i;
for(i = 1; i < 5; ++i) {
bwt->L2[i] = ((bwtint_t*)m)[i];
}
bwt->bwt = (uint32_t*) ((bwtint_t*)m + 5);
bwt->seq_len = bwt->L2[4];
bwt_gen_cnt_table(bwt);

return bwt;
}

bwt_t *bwt_restore_bwt(const char *fn, int use_mmap)
{
if (use_mmap)
return bwt_restore_bwt_mmap(fn);
else
return bwt_restore_bwt_std(fn);
}

void bwt_destroy(bwt_t *bwt)
{
if (bwt == 0) return;
free(bwt->sa); free(bwt->bwt);

if (bwt->bwt_mmap) {
fprintf(stderr, "Unmapping bwt->bwt_mmap\n");
bwt_unmap_file(bwt->bwt_mmap, bwt->bwt_size * sizeof(bwt->bwt[0]));
fprintf(stderr, "done\n");
}
else {
free(bwt->bwt);
}

if (bwt->sa_mmap) {
fprintf(stderr, "Unmapping bwt->sa_mmap\n");
bwt_unmap_file(bwt->sa_mmap, (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv);
fprintf(stderr, "done\n");
}
else {
free(bwt->sa);
}

free(bwt);
}
Loading
点击 这是indexloc提供的php浏览器服务,不要输入任何密码和下载