A research fellow at Harvard asked me to write a program to search for gene sequence, such as ‘TCC’, and record the next 4 codes. The data file is 14Gb. He tried some matlab codes, and the system freezes, or keeps running and never stops.
I first tested using a loop method (V1.0). It turns out it will take a month to finish 14Gb data on my 1.8GHz Core 2 Duo/3Gb RAM PC. Then I updated it to use matrix. It turns out it will only take 1.3 hours on my 1.8Gb PC or 40 minutes on my 2.33GHz Core 2 Duo/2Gb RAM PC. It beats any codes that he got using Python or other language.
Source codes are shown as follows, where v1.0 is using loop and slow for large data file, it records wanted codes and target sequence positions; v2.0 and v2.1 record wanted codes only without positions; v2.2 and v2.3 records both codes and positions.
The latest version of the program is also posted on mathworks: http://www.mathworks.com/matlabcentral/fileexchange/31966-fast-gene-sequence-search-for-very-large-data-file
%———————————– seqSearch.m V1.0 —————————————
function [p,s]=seqSearch(f,targSeq,lc,offset,M,print)
%[p,s] = seqSearch(f,targSeq,lc,offset,M,print)
%[p,s] = seqSearch(f,'','','','','') with all default parameters.
%
% Gene Sequence Search V1.0 - search for a specific sequence targSeq and record
% a neighboring code sequence with an offset position.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: searching target sequence, by default 'TCC'
% lc: length of the code to record, by default 4
% offset: number of offset positons from the search sequence to record the
% code. e.g., offset is 1 for the first one after the last code (to the
% right), and negative value means before the first code (to the left).
% by default 1
% M: length of the chunk of the codes to read into memory each time,
% by default 10E6
% print: display first one in every "print" number of found codes,
% by default 50
%
%Output:
% p: positions of the target sequence found (first code of the sequence)
% s: the recorded codes
%
% Binlin Wu - CCNY
% bwu@sci.ccny.cuny.edu
% Jun 10th, 2011
% targetSeq = 'TCC';
% M = 1e6;
switch nargin
case 0
error('Parameters needed: seqSearch(f,targSeq,lc,offset,M,print)')
case 1
targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; print = 100;
case 2
lc = 4; offset = 1; M = 1e6; print = 100;
case 3
offset = 1; M = 1e6; print = 100;
case 4
M = 1e6; print = 100;
case 5
print = 100;
case 6
otherwise
error('too many parameters: seqSearch(f,targSeq,lc,offset,M,print)')
end
if isempty(targSeq)
targSeq = 'TCC';
end
if isempty(lc)
lc = 4;
end
if isempty(offset)
offset = 1;
end
if isempty(M)
M = 1e6;
end
if isempty(print)
print = 100;
end
% if nargin < 5
% error('More parameters needed, seqSearch(f,targSeq,lc,offset,M,...)')
% elseif nargin==5
% print = 50;
% end
lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = 'C:\Documents and Settings\Bin\Desktop\sample1.txt';
fid = fopen(f,'r');
% ----- initialization -------
k = 0;
n = 0;
p = [];
s = '';
C0 = '';
fprintf('Start searching ...\n')
if offset > 0
lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
C = textscan(fid,'%c',M,'HeaderLines',1);
C = C{1};
if length(C) > 0 && length(C) < lMin %too short, length < lMin
for i = 1:length(C)-lt+1
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i;
s(k,:) = char(zeros(1,lc));
end
elseif length(C) >= lMin && length(C) < M %only read once, lMin <= length < M
i_END = length(C) - (lMin - 1);
for i=1:i_END
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i;
s(k,:) = C(i+lt-1+offset:i+lt-1+offset+lc-1);
end
C0 = C(i_END+1:end);
for i = 1:length(C0)-lt+1
if ~isequal(C0(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i_END+i;
s(k,:) = char(zeros(1,lc));
end
elseif length(C) == M %data is no less than M codes
% M = M+offset+lc-1;
n = 1;
fprintf('\n In the %dth %d codes: \n',n,M);
i_END = length(C) - (lMin - 1);
for i=1:i_END
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i;
% if i+lt-1+offset+lc-1 > M && length(C) < M+length(C0) %run to the end of the data
% s{k} = [];
% end
s(k,:) = C(i+lt-1+offset:i+lt-1+offset+lc-1);
end
C0 = C(i_END+1:end);
i_END = length(C);
while(1)
C = textscan(fid,'%c',M);
if isempty(C{1})
break
end
n = n+1;
i_END = length(C{1});
fprintf('\n In the %dth %d codes: \n',n,M);
C = [C0;C{1}];
% i_END = length(C) - (lMin - 1);
for i=1:i_END
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k)=(n-1)*M - (lMin - 1) + i;
% if i+lt-1+offset+lc-1 > M && length(C) < M+length(C0) %run to the end of the data
% s{k} = [];
% end
s(k,:) = C(i+lt-1+offset:i+lt-1+offset+lc-1);
end
C0 = C(i_END+1:end);
end
for i = 1:length(C0)-lt+1 % rest lMin-1 codes at the end
if ~isequal(C0(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
if i_END == M % exactly n * M codes & target sequence found in the ending lMin-1 codes
p(k)=n*M - (lMin - 1) + i;
else
p(k)=(n-1)*M - (lMin - 1) + i_END + i;
end
s(k,:) = char(zeros(1,lc));
end
end
else % when offset is negtive
lMin = lt-offset; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
if -offset < lc
error(' |offset| can not be smaller than lc, when offset < 0');
end
C = textscan(fid,'%c',M,'HeaderLines',1);
C = C{1};
if length(C) > 0 && length(C) < lMin %too short, length < lMin
for i = 1:length(C)-lt+1
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i;
s(k,:) = char(zeros(1,lc));
end
elseif length(C) >= lMin && length(C) < M %only read once, lMin <= length < M
for i = 1:(lMin-1)-lt+1
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i;
s(k,:) = char(zeros(1,lc));
end
i_BEGIN = i+1; %lMin-lt+1;
for i=i_BEGIN:length(C)-lt+1
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i;
s(k,:) = C(i+offset : i+offset+lc-1);
end
elseif length(C) == M %data is no less than M codes
% M = M+offset+lc-1;
n = 1;
fprintf('\n In the %dth %d codes: \n',n,M);
i_END = lMin - lt; %(lMin-1)-lt+1 ;
for i = 1:i_END %(lMin-1)-lt+1
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i;
s(k,:) = char(zeros(1,lc));
end
i_BEGIN = i_END + 1; %lMin-lt+1;
i_END = M - lt + 1;
for i=i_BEGIN:i_END %length(C)-lt+1
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k) = i;
s(k,:) = C(i+offset:i+offset+lc-1);
end
C0 = C(end-(lMin-1)+1:end);
while(1)
C = textscan(fid,'%c',M);
if isempty(C{1})
break
end
n = n+1;
fprintf('\n In the %dth %d codes: \n',n,M);
C = [C0;C{1}];
i_BEGIN = lMin-lt+1;
i_END = length(C)-lt+1;
for i=i_BEGIN:i_END
if ~isequal(C(i:i+lt-1)',targSeq)
continue
end
k = k+1;
if print==1 || mod(k,print)==1
fprintf(' - %dth %s sequence found\n',k,targSeq);
end
p(k)=(n-1)*M - (lMin - 1) + i;
s(k,:) = C(i+offset:i+offset+lc-1);
end
C0 = C(end-(lMin-1)+1:end);
end
end
end
fprintf('\n Searching finished! \n',n,M);
fclose(fid);
save seqSearch.mat p s
%———————————– seqSearch.m V2.0 —————————————
function seq1 = seqSearch(f,targSeq,lc,offset,M,sFlag,fs)
%seq1 = seqSearch2(f,targSeq,lc,offset,M,sFlag,fs)
%seq1 = seqSearch2(f) with all default parameters.
%seq1 = seqSearch2(f,'','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recommended.
%
% Gene Sequence Search V2.0 - search for a specific sequence and record
% a neighboring code sequence with an offset position using matrix for
% large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
% code. e.g., offset is 1 for the first one after the last code (to the
% right), and negative value means before the first code (to the left).
% by default 1
% M: length of the chunk of the codes to read into memory each time,
% by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
%
%Output:
% seq1: the recorded codes
%
% Binlin Wu - CCNY
% bwu@sci.ccny.cuny.edu
% Jun 10th, 2011
% set default values for the parameters
switch nargin
case 0
error('Parameters needed: seqSearch(f,targSeq,lc,offset,M,print)')
case 1
targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 2
lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 3
offset = 1; M = 1e6; sFlag = 0;
case 4
M = 1e6; sFlag = 0;
case 5
sFlag = 0;
case 6
if sFlag == 1
error('save destination file name missing');
end
case 7
if isequal(sFlag,1) && isempty(fs)
error('save destination file name missing');
end
otherwise
error('too many parameters: seqSearch(f,targSeq,lc,offset,M,print)')
end
if isempty(targSeq)
targSeq = 'TCC';
end
if isempty(lc)
lc = 4;
end
if isempty(offset)
offset = 1;
end
if isempty(M)
M = 1e6;
end
if isempty(sFlag)
sFlag = 0;
end
% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
while(1)
if confirm == 1
break
end
if sFlag == 1 && exist(fs,'file') == 2
fprintf('Filename %s already exists!\n',fs);
reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
if isempty(reply)
reply = 'N';
end
if ~isequal(reply,'Y') && ~isequal(reply,'N')
confirm = 0;
elseif isequal(reply,'Y')
confirm = 1;
else
fs1 = input('Please enter a new file name: ', 's');
fprintf('Codes will be recorded in file: %s\n', fs1);
reply = upper(input('Confirm? Y/N [Y]: ', 's'));
if isempty(reply)
reply = 'Y';
end
if isequal(reply,'Y')
confirm = 1;
fs = fs1;
else
confirm = 0;
end
if exist(fs1,'file') == 2
confirm = 0;
end
end
end
end
end
% --------------------------------------
lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
sfid = fopen(fs,'w');
end
% ----- initialization -------
n = 0;
C0 = '';
seq1 = '';
fprintf('Start searching ...\n')
lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
C = textscan(fid,'%c',0,'HeaderLines',1);
while(1)
C = textscan(fid,'%c',M);
if isempty(C{1})
break
end
n = n+1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
C = [C0;C{1}];
i_END = length(C) - lMin + lt;
C1 = [C(1:i_END) C(2:i_END+1) C(3:i_END+2)];
SEQ = repmat('TCC',i_END,1);
seqPos = C1 == SEQ;
seqPos = seqPos(:,1)&seqPos(:,2)&seqPos(:,3);
seqPos1 = logical( ...
[0; 0; 0; seqPos(1:end); 0] + ...
[0; 0; 0; 0; seqPos(1:end)] + ...
[0; 0; 0; 0; 0; seqPos(1:end-1)] + ...
[0; 0; 0; 0; 0; 0; seqPos(1:end-2)]);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout == 1
seq1 = [seq1;C(seqPos1)];
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end - (lMin - 1) + 1:end);
end
fprintf('\n Searching finished! \n',n,M);
fclose('all');
%———————————– seqSearch.m V2.1 —————————————
function seq1 = seqSearch(f,targSeq,lc,offset,M,sFlag,fs)
%seq1 = seqSearch2(f,targSeq,lc,offset,M,sFlag,fs)
%seq1 = seqSearch2(f) with all default parameters.
%seq1 = seqSearch2(f,'','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
% Gene Sequence Search V2.1 - search for a specific sequence and record
% a neighboring code sequence with an offset position using matrix for
% large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
% code. e.g., offset is 1 for the first one after the last code (to the
% right), and negative value means before the first code (to the left).
% by default 1
% M: length of the chunk of the codes to read into memory each time,
% by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
%
%Output:
% seq1: the recorded codes
%
% Binlin Wu - CCNY
% bwu@sci.ccny.cuny.edu
% Jun 10th, 2011
% set default values for the parameters
switch nargin
case 0
error('Parameters needed: seqSearch(f,targSeq,lc,offset,M,print)')
case 1
targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 2
lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 3
offset = 1; M = 1e6; sFlag = 0;
case 4
M = 1e6; sFlag = 0;
case 5
sFlag = 0;
case 6
if sFlag == 1
error('save destination file name missing');
end
case 7
if isequal(sFlag,1) && isempty(fs)
error('save destination file name missing');
end
otherwise
error('too many parameters: seqSearch(f,targSeq,lc,offset,M,print)')
end
if isempty(targSeq)
targSeq = 'TCC';
end
if isempty(lc)
lc = 4;
end
if isempty(offset)
offset = 1;
end
if isempty(M)
M = 1e6;
end
if isempty(sFlag)
sFlag = 0;
end
% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
while(1)
if confirm == 1
break
end
if sFlag == 1 && exist(fs,'file') == 2
fprintf('Filename %s already exists!\n',fs);
reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
if isempty(reply)
reply = 'N';
end
if ~isequal(reply,'Y') && ~isequal(reply,'N')
confirm = 0;
elseif isequal(reply,'Y')
confirm = 1;
else
fs1 = input('Please enter a new file name: ', 's');
fprintf('Codes will be recorded in file: %s\n', fs1);
reply = upper(input('Confirm? Y/N [Y]: ', 's'));
if isempty(reply)
reply = 'Y';
end
if isequal(reply,'Y')
confirm = 1;
fs = fs1;
else
confirm = 0;
end
if exist(fs1,'file') == 2
confirm = 0;
end
end
end
end
end
% --------------------------------------
lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
sfid = fopen(fs,'w');
end
% ----- initialization -------
n = 0;
C0 = '';
seq1 = '';
fprintf('Start searching ...\n')
lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
C = textscan(fid,'%c',0,'HeaderLines',1);
while(1)
C = textscan(fid,'%c',M);
if isempty(C{1})
break
end
n = n+1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
C = [C0;C{1}];
% i_END = length(C) - lMin + lt;
%
% C1 = [C(1:i_END) C(2:i_END+1) C(3:i_END+2)];
% SEQ = repmat('TCC',i_END,1);
% seqPos = C1 == SEQ;
% seqPos = seqPos(:,1)&seqPos(:,2)&seqPos(:,3);
% seqPos1 = logical( ...
% [0; 0; 0; seqPos(1:end); 0] + ...
% [0; 0; 0; 0; seqPos(1:end)] + ...
% [0; 0; 0; 0; 0; seqPos(1:end-1)] + ...
% [0; 0; 0; 0; 0; 0; seqPos(1:end-2)]);
% seqPos = strfind(C(1:i_END)','TCC');
% seqPos1 = [seqPos+lt+offset-1;seqPos+lt+offset;seqPos+lt+offset+1;seqPos+lt+offset+2];
seqPos = strfind(C(1:end-lMin + lt)',targSeq);
if ~isempty(seqPos)
seqPos1 = repmat(seqPos+lt+offset-1,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout == 1
seq1 = [seq1;C(seqPos1)];
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end - (lMin - 1) + 1:end);
end
fprintf('\n Searching finished! \n',n,M);
fclose('all');
%———————————– seqSearch.m V2.2 —————————————
function [seq,seqPos] = seqSearch(f,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch2(f,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch2(f) with all default parameters.
%[seq,seqPos] = seqSearch2(f,'','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
% Gene Sequence Search V2.2 - search for a specific sequence and record
% a neighboring code sequence with an offset position using matrix for
% large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
% code. e.g., offset is 1 for the first one after the last code (to the
% right), and negative value means before the first code (to the left).
% by default 1
% M: length of the chunk of the codes to read into memory each time,
% by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
%
%Output:
% seq: the recorded codes
% seqPos: positions of the target search sequence
%
% Binlin Wu - CCNY
% bwu@sci.ccny.cuny.edu
% Jun 10th, 2011
% set default values for the parameters
switch nargin
case 0
error('Parameters needed: seqSearch(f,targSeq,lc,offset,M,print)')
case 1
targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 2
lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 3
offset = 1; M = 1e6; sFlag = 0;
case 4
M = 1e6; sFlag = 0;
case 5
sFlag = 0;
case 6
if sFlag == 1
error('save destination file name missing');
end
case 7
if sFlag == 1 && isempty(fs)
error('save destination file name missing');
end
otherwise
error('too many parameters: seqSearch(f,targSeq,lc,offset,M,print)')
end
if isempty(targSeq)
targSeq = 'TCC';
end
if isempty(lc)
lc = 4;
end
if isempty(offset)
offset = 1;
end
if isempty(M)
M = 1e6;
end
if isempty(sFlag)
sFlag = 0;
end
% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
while(1)
if confirm == 1
break
end
if sFlag == 1 && exist(fs,'file') == 2
fprintf('Filename %s already exists!\n',fs);
reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
if isempty(reply)
reply = 'N';
end
if ~isequal(reply,'Y') && ~isequal(reply,'N')
confirm = 0;
elseif isequal(reply,'Y')
confirm = 1;
else
fs1 = input('Please enter a new file name: ', 's');
fprintf('Codes will be recorded in file: %s\n', fs1);
reply = upper(input('Confirm? Y/N [Y]: ', 's'));
if isempty(reply)
reply = 'Y';
end
if isequal(reply,'Y')
confirm = 1;
fs = fs1;
else
confirm = 0;
end
if exist(fs1,'file') == 2
confirm = 0;
end
end
end
end
end
% ----- initialization -------
lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
sfid = fopen(fs,'w');
end
n = 0;
C0 = '';
seq = '';
seqPos = [];
% ----- start search -------
fprintf('Start searching ...\n')
if offset > 0
lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
C = textscan(fid,'%c',M,'HeaderLines',1);
C = C{1};
if isempty(C)
fprintf('\n The data file is empty!\n')
return
end
if length(C) > 0 && length(C) < lMin %too short, length < lMin
if nargout == 2
seqPos0 = strfind(C',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = [seqPos seqPos0];
end
end
elseif length(C) >= lMin && length(C) < M %only read once, lMin <= length < M
seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
end
if nargout >= 2
seqPos = [seqPos seqPos0];
end
end
seqPos0 = strfind(C(end - (lMin - 1) + 1:end)',targSeq);
if ~isempty(seqPos0)
if nargout >= 1
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
end
if nargout >= 2
seqPos = [seqPos seqPos0 + length(C) - (lMin - 1)];
end
end
elseif length(C) == M %data is no less than M codes
n=1;
LastChunkLength = length(C);
fprintf('\n Checking the %dth %d codes ... \n',n,M);
seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
end
if nargout >= 2
seqPos = [seqPos seqPos0];
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end - (lMin - 1) + 1:end);
while(1)
C = textscan(fid,'%c',M);
if isempty(C{1})
break
end
LastChunkLength = length(C{1});
n = n+1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
C = [C0;C{1}];
% C1 = [C(1:i_END) C(2:i_END+1) C(3:i_END+2)];
% SEQ = repmat('TCC',i_END,1);
% seqPos = C1 == SEQ;
% seqPos = seqPos(:,1)&seqPos(:,2)&seqPos(:,3);
% seqPos1 = logical( ...
% [0; 0; 0; seqPos(1:end); 0; 0; 0 ] + ...
% [0; 0; 0; 0; seqPos(1:end); 0; 0 ] + ...
% [0; 0; 0; 0; 0; seqPos(1:end); 0 ] + ...
% [0; 0; 0; 0; 0; 0;
% seqPos(1:end)] );
% seqPos = strfind(C(1:i_END)','TCC');
% seqPos1 = [seqPos+lt+offset-1;seqPos+lt+offset;seqPos+lt+offset+1;seqPos+lt+offset+2];
seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
if nargout >= 2
seqPos = [seqPos seqPos0+(n-1)*M-(lMin-1)];
end
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end - (lMin - 1) + 1:end);
end
if nargout == 2
seqPos0 = strfind(C0',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = [seqPos seqPos0+LastChunkLength+(n-1)*M-(lMin-1)];
end
end
end
else
lMin = lt-offset; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
if -offset < lc
error(' |offset| can not be smaller than lc, when offset < 0');
end
C = textscan(fid,'%c',M,'HeaderLines',1);
C = C{1};
if isempty(C)
fprintf('\n The data file is empty!\n')
return
end
if length(C) > 0 && length(C) < lMin %too short, length < lMin
seqPos0 = strfind(C',targSeq);
if nargout == 2
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = seqPos0;
end
end
elseif length(C) >= lMin && length(C) < M %only read once, lMin <= length < M
seqPos0 = strfind(C(1:lMin-1)',targSeq);
if nargout == 2
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = seqPos0;
end
end
seqPos0 = strfind(C(-offset+1:end)',targSeq);
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
if nargout >= 2
seqPos = [seqPos seqPos0-offset];
end
end
end
elseif length(C) == M %data is no less than M codes
n = 1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
seqPos0 = strfind(C(1:lMin-1)',targSeq);
if nargout == 2
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = seqPos0;
end
end
seqPos0 = strfind(C(-offset+1:end)',targSeq);
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
if nargout >= 2
seqPos = [seqPos seqPos0-offset];
end
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end-(lMin-1)+1:end);
while(1)
C = textscan(fid,'%c',M);
if isempty(C{1})
break
end
n = n+1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
C = [C0;C{1}];
seqPos0 = strfind(C(-offset+1:end)',targSeq);
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
if nargout >= 2
seqPos = [seqPos seqPos0-offset+M*(n-1)-(lMin-1)];
end
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end-(lMin-1)+1:end);
end
end
end
fprintf('\n Searching finished! \n',n,M);
fclose('all');
%———————————– seqSearch.m V2.3 —————————————
function [seq,seqPos] = seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch2(f,nHL,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch2(f) with all default parameters.
%[seq,seqPos] = seqSearch2(f,'','','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
% Gene Sequence Search V2.3 - search for a specific sequence and record
% a neighboring code sequence with an offset position using matrix for
% large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
% code. e.g., offset is 1 for the first one after the last code (to the
% right), and negative value means before the first code (to the left).
% by default 1
% M: length of the chunk of the codes to read into memory each time,
% by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
% nHL: number of headlines you want to remove from the data, by default 0
%
%Output:
% seq: the recorded codes
% seqPos: positions of the target search sequence
%
% Binlin Wu - CCNY
% bwu@sci.ccny.cuny.edu
% Jun 10th, 2011
% set default values for the parameters
switch nargin
case 0
error('Parameters needed: seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)')
case 1
nHL = 0; targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 2
targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 3
lc = 4; offset = 1; M = 1e6; sFlag = 0;
case 4
offset = 1; M = 1e6; sFlag = 0;
case 5
M = 1e6; sFlag = 0;
case 6
sFlag = 0;
case 7
if sFlag == 1
error('save destination file name missing');
end
case 8
if isequal(sFlag,1) && isempty(fs)
error('save destination file name missing');
end
otherwise
error('too many parameters: seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)')
end
if isempty(nHL)
nHL = 0;
end
if isempty(targSeq)
targSeq = 'TCC';
end
if isempty(lc)
lc = 4;
end
if isempty(offset)
offset = 1;
end
if isempty(M)
M = 1e6;
end
if isempty(sFlag)
sFlag = 0;
end
% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
while(1)
if confirm == 1
break
end
if sFlag == 1 && exist(fs,'file') == 2
fprintf('Filename %s already exists!\n',fs);
reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
if isempty(reply)
reply = 'N';
end
if ~isequal(reply,'Y') && ~isequal(reply,'N')
confirm = 0;
elseif isequal(reply,'Y')
confirm = 1;
else
fs1 = input('Please enter a new file name: ', 's');
fprintf('Codes will be recorded in file: %s\n', fs1);
reply = upper(input('Confirm? Y/N [Y]: ', 's'));
if isempty(reply)
reply = 'Y';
end
if isequal(reply,'Y')
confirm = 1;
fs = fs1;
else
confirm = 0;
end
if exist(fs1,'file') == 2
confirm = 0;
end
end
end
end
end
% ----- initialization -------
lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
sfid = fopen(fs,'w');
end
n = 0;
C0 = '';
seq = '';
seqPos = [];
% ----- start search -------
fprintf('Start searching ...\n')
if offset > 0
lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
C = textscan(fid,'%c',M,'HeaderLines',nHL);
C = C{1};
if isempty(C) % && n == 1
fprintf('\n The data file is empty!\n')
return
elseif length(C) > 0 && length(C) < lMin %too short, length < lMin
if nargout == 2
seqPos0 = strfind(C',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = [seqPos seqPos0];
end
end
else
while(1)
if length(C) == 0
break
else
n = n+1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
LastChunkLength = length(C);
C = [C0;C];
seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);
% seqPos0 = strfind(C(1:lengthC + lt-1)',targSeq); %not good for data length <=M
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
if nargout >= 2
if n==1 %data length <= M
seqPos = seqPos0;
else %data length > M
% seqPos = [seqPos seqPos0+length(C)-(lMin-1)+(n-1)*M-(lMin-1)]; %not good for data length <=M
seqPos = [seqPos seqPos0+(n-1)*M-(lMin-1)];
end
end
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end - (lMin - 1) +1:end);
% C0 = C(lengthC + 1:end) %not good for data length <=M
end
C = textscan(fid,'%c',M);
C = C{1};
end
if nargout == 2
seqPos0 = strfind(C0',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = [seqPos seqPos0+(n-1)*M+LastChunkLength-(lMin-1)];
end
end
end
else
lMin = lt-offset; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
errMsg = sprintf('M must be larger than %d',lMin);
error(errMsg);
end
if -offset < lc
error(' |offset| can not be smaller than lc, when offset < 0');
end
C = textscan(fid,'%c',M,'HeaderLines',1);
C = C{1};
if isempty(C) % && n == 1
fprintf('\n The data file is empty!\n')
return
elseif length(C) > 0 && length(C) < lMin %too short, length < lMin
if nargout == 2
seqPos0 = strfind(C',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = seqPos0;
end
end
else
if nargout == 2
seqPos0 = strfind(C(1:lMin-1)',targSeq);
if ~isempty(seqPos0)
seq = [seq;repmat(' ',lc*length(seqPos0),1)];
seqPos = seqPos0;
end
end
while(1)
if length(C) == 0
break
else
n = n+1;
fprintf('\n Checking the %dth %d codes ... \n',n,M);
C = [C0;C];
seqPos0 = strfind(C(-offset+1:end)',targSeq);
if ~isempty(seqPos0)
seqPos1 = repmat(seqPos0,lc,1);
seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
seqPos1 = seqPos1(:);
if sFlag == 1
fprintf(sfid,'%s',C(seqPos1));
end
if nargout >= 1
seq = [seq;C(seqPos1)];
if nargout >= 2
if n == 1
seqPos = [seqPos seqPos0-offset];
else
seqPos = [seqPos seqPos0-offset+M*(n-1)-(lMin-1)];
end
end
end
end
fprintf(' The %dth %d codes done. \n',n,M);
C0 = C(end-(lMin-1)+1:end);
end
C = textscan(fid,'%c',M);
C=C{1};
end
end
end
fprintf('\n Searching finished! \n',n,M);
fclose('all');
Filed under: Computer, Matlab, Programming | Leave a comment »