save/write and load/read stack image files in Matlab

Write:

imageData = ones([100 100 10]);
tifFile = ‘ImageStack.tif’;
for i = 1:10
imwrite( imageData (:,:,i), tifFile,’WriteMode’,’append’);
end

Load:

tifFile = ‘ImageStack.tif’;
infoImage = imfinfo(tifFile);
w = InfoImage(1).Width;
h = InfoImage(1).Height;
planeNo = length(InfoImage);
imageData = zeros(h,w,planeNo,’uint16′);
for i = 1:planeNo
imageData(:,:,i) = imread(tifFile,’Index’,i);
end

Advertisements

Gene Sequence Search for Very Large Data File

A research fellow at Harvard asked me to write a program to search for gene sequence, such as ‘TCC’, and record the next 4 codes. The data file is 14Gb. He tried some matlab codes, and the system freezes, or keeps running and never stops.

I first tested using a loop method (V1.0). It turns out it will take a month to finish 14Gb data on my 1.8GHz Core 2 Duo/3Gb RAM PC. Then I updated it to use matrix. It turns out it will only take 1.3 hours on my 1.8Gb PC or 40 minutes on my 2.33GHz Core 2 Duo/2Gb RAM PC. It beats any codes that he got using Python or other language.

Source codes are shown as follows, where v1.0 is using loop and slow for large data file, it records wanted codes and target sequence positions; v2.0 and v2.1 record wanted codes only without positions; v2.2 and v2.3 records both codes and positions.

The latest version of the program is also posted on mathworks: http://www.mathworks.com/matlabcentral/fileexchange/31966-fast-gene-sequence-search-for-very-large-data-file

%———————————– seqSearch.m V1.0 —————————————

function [p,s]=seqSearch(f,targSeq,lc,offset,M,print)
%[p,s] = seqSearch(f,targSeq,lc,offset,M,print)
%[p,s] = seqSearch(f,'','','','','') with all default parameters.
%
%  Gene Sequence Search V1.0 - search for a specific sequence targSeq and record
%  a neighboring code sequence with an offset position.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: searching target sequence, by default 'TCC'
% lc: length of the code to record, by default 4
% offset: number of offset positons from the search sequence to record the
%   code. e.g., offset is 1 for the first one after the last code (to the
%   right), and negative value means before the first code (to the left).
%   by default 1
% M: length of the chunk of the codes to read into memory each time,
%   by default 10E6
% print: display first one in every "print" number of found codes,
%   by default 50
%
%Output:
% p: positions of the target sequence found (first code of the sequence)
% s: the recorded codes
%
%   Binlin Wu - CCNY
%   bwu@sci.ccny.cuny.edu
%   Jun 10th, 2011

% targetSeq = 'TCC';
% M = 1e6;

switch nargin
    case 0
        error('Parameters needed: seqSearch(f,targSeq,lc,offset,M,print)')
    case 1
        targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; print = 100;
    case 2
        lc = 4; offset = 1; M = 1e6; print = 100;
    case 3
        offset = 1; M = 1e6; print = 100;
    case 4
        M = 1e6; print = 100;
    case 5
        print = 100;
    case 6
    otherwise
        error('too many parameters: seqSearch(f,targSeq,lc,offset,M,print)')
end

if isempty(targSeq)
    targSeq = 'TCC';
end
if isempty(lc)
    lc = 4;
end
if isempty(offset)
    offset = 1;
end
if isempty(M)
    M = 1e6;
end
if isempty(print)
    print = 100;
end

% if nargin < 5
%     error('More parameters needed, seqSearch(f,targSeq,lc,offset,M,...)')
% elseif	nargin==5
%     print = 50;
% end

lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = 'C:\Documents and Settings\Bin\Desktop\sample1.txt';
fid = fopen(f,'r');

% ----- initialization -------
k = 0;
n = 0;
p = [];
s = '';
C0 = '';

fprintf('Start searching ...\n')
if offset > 0
    lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
    if M < lMin
        errMsg = sprintf('M must be larger than %d',lMin);
        error(errMsg);
    end
    C = textscan(fid,'%c',M,'HeaderLines',1);
    C = C{1};
    if length(C) > 0 && length(C) < lMin  %too short, length < lMin
        for i = 1:length(C)-lt+1
            if ~isequal(C(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i;
            s(k,:) = char(zeros(1,lc));
        end
    elseif length(C) >= lMin && length(C) < M  %only read once, lMin <= length < M
        i_END = length(C) - (lMin - 1);
        for i=1:i_END
            if ~isequal(C(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i;
            s(k,:) = C(i+lt-1+offset:i+lt-1+offset+lc-1);
        end
        C0 = C(i_END+1:end);
        for i = 1:length(C0)-lt+1
            if ~isequal(C0(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i_END+i;
            s(k,:) = char(zeros(1,lc));
        end
    elseif length(C) == M %data is no less than M codes
%         M = M+offset+lc-1;
        n = 1;
        fprintf('\n  In the %dth %d codes: \n',n,M);

        i_END = length(C) - (lMin - 1);
        for i=1:i_END
            if ~isequal(C(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i;
    %             if i+lt-1+offset+lc-1 > M && length(C) < M+length(C0) %run to the end of the data
    %                 s{k} = [];
    %             end
            s(k,:) = C(i+lt-1+offset:i+lt-1+offset+lc-1);

        end
        C0 = C(i_END+1:end);
        i_END = length(C);
        while(1)
            C = textscan(fid,'%c',M);
            if isempty(C{1})
                break
            end

            n = n+1;
            i_END = length(C{1});
            fprintf('\n  In the %dth %d codes: \n',n,M);
            C = [C0;C{1}];    

%             i_END = length(C) - (lMin - 1);

            for i=1:i_END
                if ~isequal(C(i:i+lt-1)',targSeq)
                    continue
                end
                k = k+1;
                if print==1 || mod(k,print)==1
                    fprintf('  - %dth %s sequence found\n',k,targSeq);
                end
                p(k)=(n-1)*M - (lMin - 1) + i;
        %             if i+lt-1+offset+lc-1 > M && length(C) < M+length(C0) %run to the end of the data
        %                 s{k} = [];
        %             end
                s(k,:) = C(i+lt-1+offset:i+lt-1+offset+lc-1);

            end
            C0 = C(i_END+1:end);
        end

        for i = 1:length(C0)-lt+1   % rest lMin-1 codes at the end
            if ~isequal(C0(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            if i_END == M % exactly n * M codes & target sequence found in the ending lMin-1 codes
                p(k)=n*M - (lMin - 1) + i;
            else
                p(k)=(n-1)*M - (lMin - 1) + i_END + i;
            end
            s(k,:) = char(zeros(1,lc));
        end
    end
else    % when offset is negtive
    lMin = lt-offset; %the least codes possible to have both a target sequence and the codes to record
    if M < lMin
        errMsg = sprintf('M must be larger than %d',lMin);
        error(errMsg);
    end
    if -offset < lc
        error(' |offset| can not be smaller than lc, when offset < 0');
    end
    C = textscan(fid,'%c',M,'HeaderLines',1);
    C = C{1};
    if length(C) > 0 && length(C) < lMin  %too short, length < lMin
        for i = 1:length(C)-lt+1
            if ~isequal(C(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i;
            s(k,:) = char(zeros(1,lc));
        end
    elseif length(C) >= lMin && length(C) < M  %only read once, lMin <= length < M
        for i = 1:(lMin-1)-lt+1
            if ~isequal(C(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i;
            s(k,:) = char(zeros(1,lc));
        end
        i_BEGIN = i+1; %lMin-lt+1;
        for i=i_BEGIN:length(C)-lt+1
            if ~isequal(C(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i;
            s(k,:) = C(i+offset : i+offset+lc-1);
        end
    elseif length(C) == M %data is no less than M codes
%         M = M+offset+lc-1;
        n = 1;
        fprintf('\n  In the %dth %d codes: \n',n,M);

        i_END = lMin -  lt; %(lMin-1)-lt+1 ;
        for i = 1:i_END %(lMin-1)-lt+1
            if ~isequal(C(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i;
            s(k,:) = char(zeros(1,lc));
        end

        i_BEGIN = i_END + 1; %lMin-lt+1;
        i_END = M - lt + 1;

        for i=i_BEGIN:i_END %length(C)-lt+1
            if ~isequal(C(i:i+lt-1)',targSeq)
                continue
            end
            k = k+1;
            if print==1 || mod(k,print)==1
                fprintf('  - %dth %s sequence found\n',k,targSeq);
            end
            p(k) = i;
            s(k,:) = C(i+offset:i+offset+lc-1);
        end
        C0 = C(end-(lMin-1)+1:end);
        while(1)
            C = textscan(fid,'%c',M);
            if isempty(C{1})
                break
            end

            n = n+1;
            fprintf('\n  In the %dth %d codes: \n',n,M);
            C = [C0;C{1}];

            i_BEGIN = lMin-lt+1;
            i_END = length(C)-lt+1;
            for i=i_BEGIN:i_END
                if ~isequal(C(i:i+lt-1)',targSeq)
                    continue
                end
                k = k+1;
                if print==1 || mod(k,print)==1
                    fprintf('  - %dth %s sequence found\n',k,targSeq);
                end
                p(k)=(n-1)*M - (lMin - 1) + i;
                s(k,:) = C(i+offset:i+offset+lc-1);
            end
            C0 = C(end-(lMin-1)+1:end);
        end
    end
end
fprintf('\n  Searching finished! \n',n,M);
fclose(fid);
save seqSearch.mat p s

%———————————– seqSearch.m V2.0 —————————————

function seq1 = seqSearch(f,targSeq,lc,offset,M,sFlag,fs)
%seq1 = seqSearch2(f,targSeq,lc,offset,M,sFlag,fs)
%seq1 = seqSearch2(f) with all default parameters.
%seq1 = seqSearch2(f,'','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
%  Gene Sequence Search V2.0 - search for a specific sequence and record
%  a neighboring code sequence with an offset position using matrix for
%  large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
%   code. e.g., offset is 1 for the first one after the last code (to the
%   right), and negative value means before the first code (to the left).
%   by default 1
% M: length of the chunk of the codes to read into memory each time,
%   by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
%
%Output:
% seq1: the recorded codes
%
%   Binlin Wu - CCNY
%   bwu@sci.ccny.cuny.edu
%   Jun 10th, 2011

% set default values for the parameters
switch nargin
    case 0
        error('Parameters needed: seqSearch(f,targSeq,lc,offset,M,print)')
    case 1
        targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 2
        lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 3
        offset = 1; M = 1e6; sFlag = 0;
    case 4
        M = 1e6; sFlag = 0;
    case 5
        sFlag = 0;
    case 6
        if sFlag == 1
            error('save destination file name missing');
        end
    case 7
        if isequal(sFlag,1) && isempty(fs)
            error('save destination file name missing');
        end
    otherwise
        error('too many parameters: seqSearch(f,targSeq,lc,offset,M,print)')
end

if isempty(targSeq)
    targSeq = 'TCC';
end
if isempty(lc)
    lc = 4;
end
if isempty(offset)
    offset = 1;
end
if isempty(M)
    M = 1e6;
end
if isempty(sFlag)
    sFlag = 0;
end

% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
    while(1)
        if confirm == 1
            break
        end
        if sFlag == 1 && exist(fs,'file') == 2
            fprintf('Filename %s already exists!\n',fs);
            reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
            if isempty(reply)
                reply = 'N';
            end
            if ~isequal(reply,'Y') && ~isequal(reply,'N')
                confirm = 0;
            elseif isequal(reply,'Y')
                confirm = 1;
            else
                fs1 = input('Please enter a new file name: ', 's');
                fprintf('Codes will be recorded in file: %s\n', fs1);
                reply = upper(input('Confirm? Y/N [Y]: ', 's'));
                if isempty(reply)
                    reply = 'Y';
                end
                if isequal(reply,'Y')
                    confirm = 1;
                    fs = fs1;
                else
                    confirm = 0;
                end
                if exist(fs1,'file') == 2
                    confirm = 0;
                end
            end
        end
    end
end
% --------------------------------------

lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
    sfid = fopen(fs,'w');
end

% ----- initialization -------
n = 0;
C0 = '';
seq1 = '';

fprintf('Start searching ...\n')
lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
    errMsg = sprintf('M must be larger than %d',lMin);
    error(errMsg);
end
C = textscan(fid,'%c',0,'HeaderLines',1);

while(1)
    C = textscan(fid,'%c',M);
    if isempty(C{1})
        break
    end

    n = n+1;
    fprintf('\n  Checking the %dth %d codes ... \n',n,M);
    C = [C0;C{1}];    

    i_END = length(C) - lMin + lt;

    C1 = [C(1:i_END) C(2:i_END+1) C(3:i_END+2)];
    SEQ = repmat('TCC',i_END,1);
    seqPos = C1 == SEQ;
    seqPos = seqPos(:,1)&seqPos(:,2)&seqPos(:,3);
    seqPos1 = logical( ...
              [0; 0; 0; seqPos(1:end);  0] + ...
              [0; 0; 0; 0;             seqPos(1:end)] + ...
              [0; 0; 0; 0;             0;               seqPos(1:end-1)] + ...
              [0; 0; 0; 0;             0;               0;               seqPos(1:end-2)]);

    if sFlag == 1
        fprintf(sfid,'%s',C(seqPos1));
    end
    if nargout == 1
        seq1 = [seq1;C(seqPos1)];
    end
    fprintf('  The %dth %d codes done. \n',n,M);
    C0 = C(end - (lMin - 1) + 1:end);
end
fprintf('\n  Searching finished! \n',n,M);
fclose('all');

%———————————– seqSearch.m V2.1 —————————————

function seq1 = seqSearch(f,targSeq,lc,offset,M,sFlag,fs)
%seq1 = seqSearch2(f,targSeq,lc,offset,M,sFlag,fs)
%seq1 = seqSearch2(f) with all default parameters.
%seq1 = seqSearch2(f,'','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
%  Gene Sequence Search V2.1 - search for a specific sequence and record
%  a neighboring code sequence with an offset position using matrix for
%  large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
%   code. e.g., offset is 1 for the first one after the last code (to the
%   right), and negative value means before the first code (to the left).
%   by default 1
% M: length of the chunk of the codes to read into memory each time,
%   by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
%
%Output:
% seq1: the recorded codes
%
%   Binlin Wu - CCNY
%   bwu@sci.ccny.cuny.edu
%   Jun 10th, 2011

% set default values for the parameters
switch nargin
    case 0
        error('Parameters needed: seqSearch(f,targSeq,lc,offset,M,print)')
    case 1
        targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 2
        lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 3
        offset = 1; M = 1e6; sFlag = 0;
    case 4
        M = 1e6; sFlag = 0;
    case 5
        sFlag = 0;
    case 6
        if sFlag == 1
            error('save destination file name missing');
        end
    case 7
        if isequal(sFlag,1) && isempty(fs)
            error('save destination file name missing');
        end
    otherwise
        error('too many parameters: seqSearch(f,targSeq,lc,offset,M,print)')
end

if isempty(targSeq)
    targSeq = 'TCC';
end
if isempty(lc)
    lc = 4;
end
if isempty(offset)
    offset = 1;
end
if isempty(M)
    M = 1e6;
end
if isempty(sFlag)
    sFlag = 0;
end

% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
    while(1)
        if confirm == 1
            break
        end
        if sFlag == 1 && exist(fs,'file') == 2
            fprintf('Filename %s already exists!\n',fs);
            reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
            if isempty(reply)
                reply = 'N';
            end
            if ~isequal(reply,'Y') && ~isequal(reply,'N')
                confirm = 0;
            elseif isequal(reply,'Y')
                confirm = 1;
            else
                fs1 = input('Please enter a new file name: ', 's');
                fprintf('Codes will be recorded in file: %s\n', fs1);
                reply = upper(input('Confirm? Y/N [Y]: ', 's'));
                if isempty(reply)
                    reply = 'Y';
                end
                if isequal(reply,'Y')
                    confirm = 1;
                    fs = fs1;
                else
                    confirm = 0;
                end
                if exist(fs1,'file') == 2
                    confirm = 0;
                end
            end
        end
    end
end
% --------------------------------------

lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
    sfid = fopen(fs,'w');
end

% ----- initialization -------
n = 0;
C0 = '';
seq1 = '';

fprintf('Start searching ...\n')
lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
if M < lMin
    errMsg = sprintf('M must be larger than %d',lMin);
    error(errMsg);
end
C = textscan(fid,'%c',0,'HeaderLines',1);

while(1)
    C = textscan(fid,'%c',M);
    if isempty(C{1})
        break
    end

    n = n+1;
    fprintf('\n  Checking the %dth %d codes ... \n',n,M);
    C = [C0;C{1}];    

%     i_END = length(C) - lMin + lt;
%
%     C1 = [C(1:i_END) C(2:i_END+1) C(3:i_END+2)];
%     SEQ = repmat('TCC',i_END,1);
%     seqPos = C1 == SEQ;
%     seqPos = seqPos(:,1)&seqPos(:,2)&seqPos(:,3);
%     seqPos1 = logical( ...
%               [0; 0; 0; seqPos(1:end);  0] + ...
%               [0; 0; 0; 0;             seqPos(1:end)] + ...
%               [0; 0; 0; 0;             0;               seqPos(1:end-1)] + ...
%               [0; 0; 0; 0;             0;               0;               seqPos(1:end-2)]);

%     seqPos = strfind(C(1:i_END)','TCC');
%     seqPos1 = [seqPos+lt+offset-1;seqPos+lt+offset;seqPos+lt+offset+1;seqPos+lt+offset+2];

    seqPos = strfind(C(1:end-lMin + lt)',targSeq);
    if ~isempty(seqPos)
        seqPos1 = repmat(seqPos+lt+offset-1,lc,1);
        seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos));

        seqPos1 = seqPos1(:);

        if sFlag == 1
            fprintf(sfid,'%s',C(seqPos1));
        end
        if nargout == 1
            seq1 = [seq1;C(seqPos1)];
        end
    end
    fprintf('  The %dth %d codes done. \n',n,M);
    C0 = C(end - (lMin - 1) + 1:end);
end
fprintf('\n  Searching finished! \n',n,M);
fclose('all');

%———————————– seqSearch.m V2.2 —————————————

function [seq,seqPos] = seqSearch(f,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch2(f,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch2(f) with all default parameters.
%[seq,seqPos] = seqSearch2(f,'','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
%  Gene Sequence Search V2.2 - search for a specific sequence and record
%  a neighboring code sequence with an offset position using matrix for
%  large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
%   code. e.g., offset is 1 for the first one after the last code (to the
%   right), and negative value means before the first code (to the left).
%   by default 1
% M: length of the chunk of the codes to read into memory each time,
%   by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
%
%Output:
% seq: the recorded codes
% seqPos: positions of the target search sequence
%
%   Binlin Wu - CCNY
%   bwu@sci.ccny.cuny.edu
%   Jun 10th, 2011

% set default values for the parameters
switch nargin
    case 0
        error('Parameters needed: seqSearch(f,targSeq,lc,offset,M,print)')
    case 1
        targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 2
        lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 3
        offset = 1; M = 1e6; sFlag = 0;
    case 4
        M = 1e6; sFlag = 0;
    case 5
        sFlag = 0;
    case 6
        if sFlag == 1
            error('save destination file name missing');
        end
    case 7
        if sFlag == 1 && isempty(fs)
            error('save destination file name missing');
        end
    otherwise
        error('too many parameters: seqSearch(f,targSeq,lc,offset,M,print)')
end

if isempty(targSeq)
    targSeq = 'TCC';
end
if isempty(lc)
    lc = 4;
end
if isempty(offset)
    offset = 1;
end
if isempty(M)
    M = 1e6;
end
if isempty(sFlag)
    sFlag = 0;
end

% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
    while(1)
        if confirm == 1
            break
        end
        if sFlag == 1 && exist(fs,'file') == 2
            fprintf('Filename %s already exists!\n',fs);
            reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
            if isempty(reply)
                reply = 'N';
            end
            if ~isequal(reply,'Y') && ~isequal(reply,'N')
                confirm = 0;
            elseif isequal(reply,'Y')
                confirm = 1;
            else
                fs1 = input('Please enter a new file name: ', 's');
                fprintf('Codes will be recorded in file: %s\n', fs1);
                reply = upper(input('Confirm? Y/N [Y]: ', 's'));
                if isempty(reply)
                    reply = 'Y';
                end
                if isequal(reply,'Y')
                    confirm = 1;
                    fs = fs1;
                else
                    confirm = 0;
                end
                if exist(fs1,'file') == 2
                    confirm = 0;
                end
            end
        end
    end
end

% ----- initialization -------

lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
    sfid = fopen(fs,'w');
end

n = 0;
C0 = '';
seq = '';
seqPos = [];

% ----- start search -------

fprintf('Start searching ...\n')
if offset > 0
    lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
    if M < lMin
        errMsg = sprintf('M must be larger than %d',lMin);
        error(errMsg);
    end

    C = textscan(fid,'%c',M,'HeaderLines',1);
    C = C{1};

    if isempty(C)
        fprintf('\n  The data file is empty!\n')
        return
    end

    if length(C) > 0 && length(C) < lMin  %too short, length < lMin
        if nargout == 2
            seqPos0 = strfind(C',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = [seqPos seqPos0];
            end
        end

    elseif length(C) >= lMin && length(C) < M  %only read once, lMin <= length < M

        seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);

        if ~isempty(seqPos0)
            seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
            seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
            seqPos1 = seqPos1(:);

            if sFlag == 1
                fprintf(sfid,'%s',C(seqPos1));
            end
            if nargout >= 1
                seq = [seq;C(seqPos1)];
            end
            if nargout >= 2
                seqPos = [seqPos seqPos0];
            end
        end             

        seqPos0 = strfind(C(end - (lMin - 1) + 1:end)',targSeq);
        if ~isempty(seqPos0)
            if nargout >= 1
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
            end
            if nargout >= 2
                seqPos = [seqPos seqPos0 + length(C) - (lMin - 1)];
            end
        end

    elseif length(C) == M %data is no less than M codes
        n=1;
        LastChunkLength = length(C);
        fprintf('\n  Checking the %dth %d codes ... \n',n,M);

        seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);

        if ~isempty(seqPos0)
            seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
            seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
            seqPos1 = seqPos1(:);

            if sFlag == 1
                fprintf(sfid,'%s',C(seqPos1));
            end
            if nargout >= 1
                seq = [seq;C(seqPos1)];
            end
            if nargout >= 2
                seqPos = [seqPos seqPos0];
            end
        end
        fprintf('  The %dth %d codes done. \n',n,M);
        C0 = C(end - (lMin - 1) + 1:end);                 

        while(1)
            C = textscan(fid,'%c',M);
            if isempty(C{1})
                break
            end
            LastChunkLength = length(C{1});
            n = n+1;
            fprintf('\n  Checking the %dth %d codes ... \n',n,M);
            C = [C0;C{1}];    

        %     C1 = [C(1:i_END) C(2:i_END+1) C(3:i_END+2)];
        %     SEQ = repmat('TCC',i_END,1);
        %     seqPos = C1 == SEQ;
        %     seqPos = seqPos(:,1)&seqPos(:,2)&seqPos(:,3);
        %     seqPos1 = logical( ...
        %               [0; 0; 0; seqPos(1:end); 0;               0;               0            ] + ...
        %               [0; 0; 0; 0;             seqPos(1:end);   0;               0            ] + ...
        %               [0; 0; 0; 0;             0;               seqPos(1:end);   0            ] + ...
        %               [0; 0; 0; 0;             0;               0;
        %               seqPos(1:end)] );

    %         seqPos = strfind(C(1:i_END)','TCC');
    %         seqPos1 = [seqPos+lt+offset-1;seqPos+lt+offset;seqPos+lt+offset+1;seqPos+lt+offset+2];

            seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);
            if ~isempty(seqPos0)
                seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
                seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
                seqPos1 = seqPos1(:);

                if sFlag == 1
                    fprintf(sfid,'%s',C(seqPos1));
                end
                if nargout >= 1
                    seq = [seq;C(seqPos1)];
                    if nargout >= 2
                        seqPos = [seqPos seqPos0+(n-1)*M-(lMin-1)];
                    end
                end
            end
            fprintf('  The %dth %d codes done. \n',n,M);
            C0 = C(end - (lMin - 1) + 1:end);
        end
        if nargout == 2
            seqPos0 = strfind(C0',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = [seqPos seqPos0+LastChunkLength+(n-1)*M-(lMin-1)];
            end
        end
    end 

else
    lMin = lt-offset; %the least codes possible to have both a target sequence and the codes to record
    if M < lMin
        errMsg = sprintf('M must be larger than %d',lMin);
        error(errMsg);
    end
    if -offset < lc
        error(' |offset| can not be smaller than lc, when offset < 0');
    end

    C = textscan(fid,'%c',M,'HeaderLines',1);
    C = C{1};

    if isempty(C)
        fprintf('\n  The data file is empty!\n')
        return
    end    

    if length(C) > 0 && length(C) < lMin  %too short, length < lMin

        seqPos0 = strfind(C',targSeq);
        if nargout == 2
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = seqPos0;
            end
        end

    elseif length(C) >= lMin && length(C) < M  %only read once, lMin <= length < M

        seqPos0 = strfind(C(1:lMin-1)',targSeq);
        if nargout == 2
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = seqPos0;
            end
        end     

        seqPos0 = strfind(C(-offset+1:end)',targSeq);

        if ~isempty(seqPos0)
            seqPos1 = repmat(seqPos0,lc,1);
            seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
            seqPos1 = seqPos1(:);

            if sFlag == 1
                fprintf(sfid,'%s',C(seqPos1));
            end
            if nargout >= 1
                seq = [seq;C(seqPos1)];
                if nargout >= 2
                    seqPos = [seqPos seqPos0-offset];
                end
            end
        end
    elseif length(C) == M %data is no less than M codes
        n = 1;
        fprintf('\n  Checking the %dth %d codes ... \n',n,M);

        seqPos0 = strfind(C(1:lMin-1)',targSeq);
        if nargout == 2
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = seqPos0;
            end
        end         

        seqPos0 = strfind(C(-offset+1:end)',targSeq);
        if ~isempty(seqPos0)
            seqPos1 = repmat(seqPos0,lc,1);
            seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
            seqPos1 = seqPos1(:);

            if sFlag == 1
                fprintf(sfid,'%s',C(seqPos1));
            end
            if nargout >= 1
                seq = [seq;C(seqPos1)];
                if nargout >= 2
                    seqPos = [seqPos seqPos0-offset];
                end
            end
        end
        fprintf('  The %dth %d codes done. \n',n,M);
        C0 = C(end-(lMin-1)+1:end);

        while(1)
            C = textscan(fid,'%c',M);
            if isempty(C{1})
                break
            end

            n = n+1;
            fprintf('\n  Checking the %dth %d codes ... \n',n,M);
            C = [C0;C{1}];    

            seqPos0 = strfind(C(-offset+1:end)',targSeq);

            if ~isempty(seqPos0)
                seqPos1 = repmat(seqPos0,lc,1);
                seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
                seqPos1 = seqPos1(:);
                if sFlag == 1
                    fprintf(sfid,'%s',C(seqPos1));
                end
                if nargout >= 1
                    seq = [seq;C(seqPos1)];
                    if nargout >= 2
                        seqPos = [seqPos seqPos0-offset+M*(n-1)-(lMin-1)];
                    end
                end
            end
            fprintf('  The %dth %d codes done. \n',n,M);
            C0 = C(end-(lMin-1)+1:end);
        end

    end
end

fprintf('\n  Searching finished! \n',n,M);
fclose('all');

%———————————– seqSearch.m V2.3 —————————————

function [seq,seqPos] = seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch2(f,nHL,targSeq,lc,offset,M,sFlag,fs)
%[seq,seqPos] = seqSearch2(f) with all default parameters.
%[seq,seqPos] = seqSearch2(f,'','','','',1e7) with all default parameters except M
%** If the dataset is very large, NOT using output is highly recoomended.
%
%  Gene Sequence Search V2.3 - search for a specific sequence and record
%  a neighboring code sequence with an offset position using matrix for
%  large datasets.
%
%Input:
% f: data file path, format: 'filepath', e.g. 'C:\sample.seq'
% targSeq: search target sequence, by default 'TCC'
% lc: length of the codes to record, by default 4
% offset: number of offset positons from the search sequence to record the
%   code. e.g., offset is 1 for the first one after the last code (to the
%   right), and negative value means before the first code (to the left).
%   by default 1
% M: length of the chunk of the codes to read into memory each time,
%   by default 10E6
% sFlag: 1 for save, 0 for not save, by default 0
% fs: filename to save, only valid when sFlag = 1, e.g. 'seq.txt'
% nHL: number of headlines you want to remove from the data, by default 0
%
%Output:
% seq: the recorded codes
% seqPos: positions of the target search sequence
%
%   Binlin Wu - CCNY
%   bwu@sci.ccny.cuny.edu
%   Jun 10th, 2011

% set default values for the parameters
switch nargin
    case 0
        error('Parameters needed: seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)')
    case 1
        nHL = 0; targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 2
        targSeq = 'TCC'; lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 3
        lc = 4; offset = 1; M = 1e6; sFlag = 0;
    case 4
        offset = 1; M = 1e6; sFlag = 0;
    case 5
        M = 1e6; sFlag = 0;
    case 6
        sFlag = 0;
    case 7
        if sFlag == 1
            error('save destination file name missing');
        end
    case 8
        if isequal(sFlag,1) && isempty(fs)
            error('save destination file name missing');
        end
    otherwise
        error('too many parameters: seqSearch(f,nHL,targSeq,lc,offset,M,sFlag,fs)')
end

if isempty(nHL)
    nHL = 0;
end
if isempty(targSeq)
    targSeq = 'TCC';
end
if isempty(lc)
    lc = 4;
end
if isempty(offset)
    offset = 1;
end
if isempty(M)
    M = 1e6;
end
if isempty(sFlag)
    sFlag = 0;
end

% ---- existing save file overwritten confirm ---
confirm = 0;
if sFlag == 1 && exist(fs,'file') == 2
    while(1)
        if confirm == 1
            break
        end
        if sFlag == 1 && exist(fs,'file') == 2
            fprintf('Filename %s already exists!\n',fs);
            reply = upper(input('Do you want to overwrite it? Y/N [N]: ', 's'));
            if isempty(reply)
                reply = 'N';
            end
            if ~isequal(reply,'Y') && ~isequal(reply,'N')
                confirm = 0;
            elseif isequal(reply,'Y')
                confirm = 1;
            else
                fs1 = input('Please enter a new file name: ', 's');
                fprintf('Codes will be recorded in file: %s\n', fs1);
                reply = upper(input('Confirm? Y/N [Y]: ', 's'));
                if isempty(reply)
                    reply = 'Y';
                end
                if isequal(reply,'Y')
                    confirm = 1;
                    fs = fs1;
                else
                    confirm = 0;
                end
                if exist(fs1,'file') == 2
                    confirm = 0;
                end
            end
        end
    end
end

% ----- initialization -------

lt = length(targSeq); %length of the search target sequence
fclose('all');
% f = '.\sample.seq';
fid = fopen(f,'r');
if sFlag == 1
    sfid = fopen(fs,'w');
end

n = 0;
C0 = '';
seq = '';
seqPos = [];

% ----- start search -------

fprintf('Start searching ...\n')
if offset > 0
    lMin = lt+offset+lc-1; %the least codes possible to have both a target sequence and the codes to record
    if M < lMin
        errMsg = sprintf('M must be larger than %d',lMin);
        error(errMsg);
    end

    C = textscan(fid,'%c',M,'HeaderLines',nHL);
    C = C{1};

    if isempty(C) % && n == 1
        fprintf('\n  The data file is empty!\n')
        return

    elseif length(C) > 0 && length(C) < lMin  %too short, length < lMin
        if nargout == 2
            seqPos0 = strfind(C',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = [seqPos seqPos0];
            end
        end

    else
        while(1)
            if length(C) == 0
                break
            else
                n = n+1;
                fprintf('\n  Checking the %dth %d codes ... \n',n,M);
                LastChunkLength = length(C);
                C = [C0;C];
                seqPos0 = strfind(C(1:end - (lMin - 1) + (lt-1))',targSeq);
%                 seqPos0 = strfind(C(1:lengthC + lt-1)',targSeq); %not good for data length <=M
                if ~isempty(seqPos0)
                    seqPos1 = repmat(seqPos0+lt-1+offset,lc,1);
                    seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
                    seqPos1 = seqPos1(:);

                    if sFlag == 1
                        fprintf(sfid,'%s',C(seqPos1));
                    end
                    if nargout >= 1
                        seq = [seq;C(seqPos1)];
                        if nargout >= 2
                            if n==1 %data length <= M
                                seqPos = seqPos0;
                            else %data length > M
%                                 seqPos = [seqPos seqPos0+length(C)-(lMin-1)+(n-1)*M-(lMin-1)];  %not good for data length <=M
                                seqPos = [seqPos seqPos0+(n-1)*M-(lMin-1)];
                            end
                        end
                    end
                end
                fprintf('  The %dth %d codes done. \n',n,M);
                C0 = C(end - (lMin - 1) +1:end);
%                 C0 = C(lengthC + 1:end)  %not good for data length <=M
            end
            C = textscan(fid,'%c',M);
            C = C{1};
        end

        if nargout == 2
            seqPos0 = strfind(C0',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = [seqPos seqPos0+(n-1)*M+LastChunkLength-(lMin-1)];
            end
        end
    end
else
    lMin = lt-offset; %the least codes possible to have both a target sequence and the codes to record
    if M < lMin
        errMsg = sprintf('M must be larger than %d',lMin);
        error(errMsg);
    end
    if -offset < lc
        error(' |offset| can not be smaller than lc, when offset < 0');
    end

    C = textscan(fid,'%c',M,'HeaderLines',1);
    C = C{1};

    if isempty(C) % && n == 1
        fprintf('\n  The data file is empty!\n')
        return

    elseif length(C) > 0 && length(C) < lMin  %too short, length < lMin
        if nargout == 2
            seqPos0 = strfind(C',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = seqPos0;
            end
        end

    else
        if nargout == 2
            seqPos0 = strfind(C(1:lMin-1)',targSeq);
            if ~isempty(seqPos0)
                seq = [seq;repmat(' ',lc*length(seqPos0),1)];
                seqPos = seqPos0;
            end
        end
        while(1)
            if length(C) == 0
                break
            else
                n = n+1;
                fprintf('\n  Checking the %dth %d codes ... \n',n,M);
                C = [C0;C];
                seqPos0 = strfind(C(-offset+1:end)',targSeq);

                if ~isempty(seqPos0)
                    seqPos1 = repmat(seqPos0,lc,1);
                    seqPos1 = seqPos1 + repmat([0:lc-1]',1,length(seqPos0));
                    seqPos1 = seqPos1(:);

                    if sFlag == 1
                        fprintf(sfid,'%s',C(seqPos1));
                    end
                    if nargout >= 1
                        seq = [seq;C(seqPos1)];
                        if nargout >= 2
                            if n == 1
                                seqPos = [seqPos seqPos0-offset];
                            else
                                seqPos = [seqPos seqPos0-offset+M*(n-1)-(lMin-1)];
                            end
                        end
                    end
                end
                fprintf('  The %dth %d codes done. \n',n,M);
                C0 = C(end-(lMin-1)+1:end);
            end
            C = textscan(fid,'%c',M);
            C=C{1};
        end
    end
end

fprintf('\n  Searching finished! \n',n,M);
fclose('all');

Run two Matlab functions simutaneously in parallel

My friend gave me a project: run two matlab loops (can be put in two functions) simultaneously. They have to be synchronized. They have to start at the same time, and they have to synchronize every step in loop. The problem was simplified because the two loops with run with a fairly stable step size. So what I need do is to start the two functions at the same time with error less than 0.1ms.

To run two matlab functions, we need parallel computing, or simply use two matlab sessions. For two matlab sessions, we need make a feedback in one session to trigger the other session. There are different ways to make the trigger.

1. Use hard drive to communite between two session is one way. Basically, we need first initialize a start.ini file with -1:

fid=fopen('start.ini','w');
fprintf(fid,'%d',-1);
fclose(fid);

Then one session start function 1 and write 1 to the ini file:

fid=fopen('start.ini','w');
fprintf(fid,'%d',1);
fclose(fid);

In the other session, it keeps checking the ini file, it starts once it reads 1.

while 1
    fid=fopen('start.ini','r');
    k=fscanf(fid,'%d',1);
    fclose(fid);
    if k==1
    %-------- reset start.ini to -1 start ---------------%
        fid=fopen('start.ini','w');
        fprintf(fid,'%d',-1);
        fclose(fid);
    %-------- reset start.ini to -1 end ---------------%
        break
    end
end

This works in a lot of scenarios. Here the problem with this is it takes a few milliseconds, and it fluctuates. So this is not a good solution.

2. Use a shared memory. Ram should be faster than Hard Drive. Basically, we can have one session write a variable to the memory, and have the other session read it, just like writing and reading from hard drive, but faster. The problem is two matlab sessions use two separate workspaces residing in different memory locations, not shared.

Actually, if we could, creating a shared variable in CPU cache (register) is the fastest way to read and write a variable for a computer. But I can’t find anything in matlab about that.

There is method called DDE (Dynamic Data Exchange) that makes different applications communicate.  But I didn’t find anything that can make DDE work between two matlab session.

Then I found a matlab program sharedmatrix on mathworks: http://www.mathworks.com/matlabcentral/fileexchange/28572-sharedmatrix. This program creates a memory block that will be shared by two matlab sessions. I managed to compile it in Matlab 2010a and Ubuntu. But I can’t make it work. This program seems very unstable. When the second session read the shared memory, matlab crashes. So I gave up this method.

There are other methods to make the feedback, but I can’t think of a way that can really solve my problem.

3. Use parallel computing

Another method to make the synchronization is to use parallel computing. A more detailed post can be found here about parallel computing in matlab: https://alenblog.wordpress.com/2011/03/31/parallel-computing-in-matlab/.

Basically, we can have two threads that can call two functions simultaneously. The traditional multithreading is actually through time division. So the time actually not synchronized for the two functions. But I’m not sure what the time difference is. I found a post here, where basically some codes were written in c++ for multithreading, it can be compiled in matlab to MEX file, and then be called. But like in the comment, I got same errors when compiling:

mex mythreadprog.cpp

Error mythreadprog.cpp: 11 syntax error; found `MyThread’ expecting `;’
Error mythreadprog.cpp: 19 redeclaration of `mexFunction’ previously declared at .\mex.h 135
Warning mythreadprog.cpp: 23 assignment of pointer to void function(pointer to void) to pointer to int function(pointer to void)
Error mythreadprog.cpp: 33 syntax error; found `MyThread’ expecting `;’
Warning mythreadprog.cpp: 58 missing return value
Error mythreadprog.cpp: 33 undefined size for `void cdecl’
4 errors, 2 warnings

C:\PROGRA~1\MATLAB\R2007B\BIN\MEX.PL: Error: Compile of ‘mythreadprog.cpp’ failed.

The new multithreading parallel computing using multi-core computer is built in recent matlab versions and can really do the job.

I found different ways to do parallel computing in matlab. The way posted here is using parallel computing, but it doesn’t make the synchronization. The code is shown below:

 
funList = {@fun1,@fun2,@fun3};
dataList = {data1,data2,data3}; %# or pass file names 

parfor i=1:length(funList)
    %# call the function
    funList{i}(dataList{i});
end

Finally, I got a way using matlabpool open two labs, and use spmd to run two functions in two labs separately. labBarrier is used to synchronize the two labs. The code is shown below.

function data = main

struct data1 data2;
data1.x = 1;
data2.x = 2;
funList = {@fun1,@fun2};
dataList = {data1,data2}; % or pass file names 

matlabpool open 2
spmd
    labBarrier
	funList{labindex}(dataList{labindex})
end
matlabpool close

function y = fun1(data1)
time1=clock;
fprintf('%02d : %02d : %02f\n',time1(4),time1(5),time1(6))
labindex
pause(5)
disp('run f1');

function y = fun2(data2)
time2=clock;
fprintf('%02d : %02d : %02f\n',time2(4),time2(5),time2(6))
labindex
pause(5);
disp('run f2')

We can see the two functions are starts at the same time. But the clock command only shows to the accuracy of 1ms.

Parallel Computing in Matlab

1. Definition

Here is the definition for multithreading (From Wikipedia):

In computer science, a thread of execution is the smallest unit of processing that can be scheduled by anoperating system. It generally results from a fork of acomputer program into two or more concurrently runningtasks.

On a single processor, multithreading generally occurs by time-division multiplexing (as inmultitasking): the processor switches between different threads. On a multiprocessor or multi-core system, the threads or tasks will actually run at the same time, with each processor or core running a particular thread or task.

The difference between multithreading and parallel computing has been vague. Different people have different understandings.

Here (http://www.danielmoth.com/Blog/threadingconcurrency-vs-parallelism.aspx), parallel computing is just for multi-core/cpu system.

Here (http://stackoverflow.com/questions/1740304/difference-between-multithreaded-and-parallel-programming), it says “multithreading is a type of parallelism, so parallel programming would include multithreading. Other types of parallelism include multiprocessing (where you use multiple processes that communicate with each other) and distributed computing (where tasks run on different computers).”

Recently when people talk about parallel computing, they usually refer to using muti-core cpu or multiple computers or GPU. But generally speaking, I would rather use the latter one, since people have been using the terminology “parallel” for a long time in a lot of cases, such as parallel port vs serial port. Time division in optical communication is also common, which I think is also considered parallelism.

Here (http://zone.ni.com/devzone/cda/tut/p/id/6424) explained the difference between multithreading and multitasking.

2. Multithreading in matlab

http://www.mathworks.com/support/solutions/en/data/1-WNXI8/index.html?solution=1-WNXI8

Prior to MATLAB 7.4 (R2007a) MATLAB was a single threaded application. So on a hyperthreaded dual processor machine which runs 4 threads, two per CPU, MATLAB will take only 25% of CPU time corresponding to 1 thread.

As of MATLAB 7.4 (R2007a), MATLAB can be multi-threaded for certain applications.

http://www.mathworks.com/support/solutions/en/data/1-46OY0H/index.html?solution=1-46OY0H

Multithreaded computations, introduced in MATLAB R2007a, are turned on by default as of MATLAB R2008a release.

Another words, prior to matlab 7.4, there is no multithreading. Even if you have multi-core computer, it doesn’t help. While for higher version matlabs, multithreading are embedded, so we can benefit from multi-core computer.

I actually found some people made mex files using another language like c++ or java to achieve multithreading in Matlab. For example: http://www.instructables.com/id/Matlab-Multithreading-EASY/.

I was trying to compile it. I tried in Ubuntu with matlab 2010a, and in windows with matlab 2010b. I failed and got the errors as other people got:

mex mythreadprog.cpp

Error mythreadprog.cpp: 11 syntax error; found `MyThread’ expecting `;’
Error mythreadprog.cpp: 19 redeclaration of `mexFunction’ previously declared at .\mex.h 135
Warning mythreadprog.cpp: 23 assignment of pointer to void function(pointer to void) to pointer to int function(pointer to void)
Error mythreadprog.cpp: 33 syntax error; found `MyThread’ expecting `;’
Warning mythreadprog.cpp: 58 missing return value
Error mythreadprog.cpp: 33 undefined size for `void cdecl’
4 errors, 2 warnings

C:\PROGRA~1\MATLAB\R2007B\BIN\MEX.PL: Error: Compile of ‘mythreadprog.cpp’ failed.

Another multithreading example compiled from c is here:

http://robertoostenveld.ruhosting.nl/index.php/multithreading-matlab-mex/

3. Vectorization

Matlab is a matrix language. Vectors and Matrices are used in matlab to make computing faster and improve performance. You can find more details about the techniques improving performance in Matlab (http://www.mathworks.com/help/techdoc/matlab_prog/f8-784135.html and http://www.mathworks.com/support/solutions/en/data/1-15NM7/?solution=1-15NM7).

As I understand, vectorization is not parallel computing. Vectorization is used to improve performance due to higher memory efficiency. But of course, vector and matrix computing can also use parallel computing.

4. Parallel computing in Matlab

A great article can be found here: https://docs.google.com/viewer?url=http%3A%2F%2Fweb.eecs.utk.edu%2F~luszczek%2Fpubs%2Fparallelmatlab.pdf

The parallel computing toolbox was actually embedded in matlab since R2006b. But multithreading is built-in in recent versions. It seems the multithreading is built for multi-core computers. We won’t benefit from multithreading using a single-core computer. For more information about parallel computing in Matlab, check: http://www.mathworks.com/products/parallel-computing/, “Parallel Computing Toolbox™ lets you solve computationally and data-intensive problems using multicore processors, GPUs, and computer clusters.”, http://www.mathworks.com/help/toolbox/distcomp/f3-6010.html and http://www.mathworks.com/help/toolbox/optim/ug/briutum.html.

More Links for parallel computing matlab:

http://www.sciencedirect.com/science?_ob=ArticleURL&_udi=B6V4G-4WGHJMS-1&_user=699445&_coverDate=11%2F30%2F2009&_rdoc=1&_fmt=high&_orig=gateway&_origin=gateway&_sort=d&_docanchor=&view=c&_searchStrId=1699822384&_rerunOrigin=google&_acct=C000039258&_version=1&_urlVersion=0&_userid=699445&md5=85820ea2f90115e9f443437913a5f2b2&searchtype=a

https://docs.google.com/viewer?url=http%3A%2F%2Fwww.serc.iisc.ernet.in%2FComputingFacilities%2Fsoftware%2Fmatlab-7.11%2Fmatlab_parallel_toolbox%2Fdistcomp.pdf

http://www.mathworks.com/products/parallel-computing/builtin-parallel-support.html

http://www.mathworks.com/products/distriben/

http://www.mathworks.com/support/product/DM/installation/ver_current/

Even though R2006b was single-threaded, people can benefit from the parallel computing toolbox with network cluster computers and distributed arrays whichhas better memory efficient usage. “each lab has to store and process means a more efficient use of memory and faster processing”.

“Parallel for-loops let you run a for-loop across your labs simultaneously.” One thing I don’t understand is that when I tried Parallel for-loops  on R2006b, I can only have one lab, but parfor still runs faster than for loop.

5. Some test results

Note: Factors That Affect Speed (http://www.mathworks.com/help/toolbox/optim/ug/briutum.html)

a. Parallel overhead. There is overhead in calling parfor instead of for. If function evaluations are fast, this overhead could become appreciable. In particular, solving a problem in parallel can be slower than solving the problem serially.

b. No nested parfor loops. When executing serially, parfor loops run slower than for loops. Therefore, for best performance, ensure that only your outermost parallel loop calls parfor.

Here are some testing codes and results using Matlab R2010b in Windows XP.

Codes:

%%%%%%% 1 %%%%%%%

clear all

M=zeros(1,100000);

tic

for i=1:100000

M(i)=i;

end

toc

%%%%%%%% 2 %%%%%%%%%%%%

clear all

tic

for i=1:100000

M(i)=i;

end

toc

%%%%%%%%%3 %%%%%%%%%%%

clear all

M=zeros(1,100000);

tic

parfor i=1:100000

M(i)=i;

end

toc

%%%%%%%% 4 %%%%%%%%%%%%

clear all

tic

parfor i=1:100000

M(i)=i;

end

toc

%%%%%%%% 5 %%%%%%%%%%%%

clear all

tic

M = 1:100000;

toc

%%%%%%%%% 6 %%%%%%%%%%%

clear all

matlabpool open 2

tic

parfor i=1:100000

M(i)=i;

end

toc

matlabpool close

%%%%%%%%% 7 %%%%%%%%%%%

clear all

matlabpool open 2

tic

parfor i=1:100

fprintf(‘labindex = %d\n’, labindex);

M(i)=i;

end

toc

matlabpool close

%%%%%%%%% 8 %%%%%%%%%%%

clear all

matlabpool open 2

tic

spmd

labindex

end

tocmatlabpool close

%%%%%%%%%%%%%%%%%%%%

Results:

1) regular for loop using pre-allocation:

Elapsed time is 0.000813 seconds.

2) regular for loop without pre-allocation:

Elapsed time is 20.986812 seconds.

3) parfor-loop with pre-allocation

Elapsed time is 0.157174 seconds.

4) parfor-loop without pre-allocation

Elapsed time is 0.157019 seconds.

5) using vector

Elapsed time is 0.000505 seconds.

6) open a work pool of size 2 (the most I can create with my computer)

Elapsed time is 0.261995 seconds.

7) labindex = 1

8 ) Lab 1: 1, Lab 2: 2

In R2006, 4) is also faster than 2). Like I said, I don’t know the exact answer. Here, 4) is faster than 2) too in Matlab 2010b, without opening multiple worker labs. Basically I don’t how parfor-loop works. It’s called parallel simoutaneously. Different iterations can be run at the same time, and the order is not important. But if there’s only a single thread (R2006b), is it using multi-core of CPU? How does it do that if I don’t open multiple work labs using matlabpool. If it’s using a single core and single thread, how is the “parallelism” done? It seems the answer has to be “a cpu-core can be used without creating a worker lab.” But when I checked the processes for 1) through 4), only one matlab process is running which takes 50% cpu resources.

3) and 4) are about the same and a little slower than 1). This may be because the parfor parallel processing takes about the same time as pre-allocation in 1), but then parfor loop has its overhead added, which make it a little slower.

In 6) When matlabpool open is used, the parfor-loop even takes longer. With or without pre-allocation are about the same. And 7) shows all parfor loops are performed in lab 1. Lab 2 is not used. I don’t understand wny lab 2 is not used either. Another thing is that I have noticed in 6) and 7), there are totally 3 matlab processes, one of them is about 0%, each of the rest two takes about 50%. Another words, when two worker labs are created, two cpu cores are used. In total it takes 100% CPU. But when I show the labindex for each parfor loop, the labindex is always 1. And using two cores is not faster than just using one core. It seems one core is working for nothing. I tied to change in 6) from “parfor i=1:100000” to “parfor (i=100000,1)”, it takes about the same time.

It seems I’m missing some point here.
Using spmd in 8 ) seems to have two worker working for sure.

Matlab: Read matrix data from a file

In matlab, there are many commands we can use to read data from a file, such as fscanf, fread, dlmread, load, textscan, fgetl, fgets etc.

Sometimes, I use fscanf, which can read a certain number of elements in different format into variables, such as integer, double, character etc.

Now if I can want to read a numeric matrix from a file. If there are no characters, then many commands such as fscanf, dlmread and load can work.

If there is a header which are characters. But you just want to read the numeric matrix. For example

x     y     z

1     2     3

4     5     6

In this case, load can’t work, because when load is used to load a ASCII file, it can only read numeric data.

dlmread works, using:

M = dlmread(FILENAME,DELIMITER,ROW,COLUMN)

Row and column starts from 0. Here ROW should be 1, COLUMN is 0.

fscanf can also work. But we need ignore the first row. So what I do is:

fid = fopen(‘FILENAME’,’r’);

h = fscanf(fid,’%s’,3);

M = fscanf(fid,’%f %f %f’,[3 2]);

Note: load and dlmread will generate a matrix that has the same form as in the text file. But fscanf will be the transpose of it, since it read a row (different columns) from the text file into the 1st dimension of the matrix which will be a column (different rows).

In this case, dlmread will give you

1 2 3

4 5 6

While fscanf will give you

1   4

2   5

3   6

Plot Dynamic MRI Slice Images in Matlab

%% mriplot.m
%  Click on any the three MRI images and press "ENTER" to change slices at new [x,y,z] positions.
%  Press "ESC" and press "ENTER" to exit.
%  by Binlin Wu -- CCNY Physics
%  09/14/2010
load mri.mat;
D1=double(squeeze(D));
DIM = size(D1);
% [X,Y,Z]=meshgrid(1:DIM(2),1:DIM(1),1:DIM(3));
h1=subplot(2,2,1);imagesc(D1(:,:,round(DIM(3)/2)),[min(D1(:)) max(D1(:))]);colormap(gray);title('axial');colorbar;
xlabel('x');ylabel('y');
h2=subplot(2,2,2);imagesc(squeeze(D1(:,round(DIM(2)/2),:)),[min(D1(:)) max(D1(:))]);colormap(gray);title('sagittal');colorbar;
xlabel('z');ylabel('y');
h3=subplot(2,2,3);imagesc(squeeze(D1(round(DIM(1)/2),:,:)),[min(D1(:)) max(D1(:))]);colormap(gray);title('coronal');colorbar;
xlabel('z');ylabel('x');
% subplot(2,2,4);slice(X,Y,Z,D1,64,64,14);colormap(gray);shading flat;title('3D Slices')
% xlabel('x');ylabel('y');zlabel('z');
x=round(DIM(2)/2);y=round(DIM(1)/2);z=round(DIM(3)/2);
button = 0;
while(1)
try
[A,B,button]=ginput;
catch
return
end
if length(A)==0
A=14;B=64;button=0;
end
A=A(end);
B=B(end);
button=button(end);
A=ceil(A-0.5);
B=ceil(B-0.5);
if button==27
break;
end
if gca==h1
x=A;
y=B;
if x <= 0 || x > DIM(2) || y <=0 || y> DIM(1)
continue
end
axes(h2);imagesc(squeeze(D1(:,x,:)),[min(D1(:)) max(D1(:))]);colormap(gray);title('sagittal');colorbar;
xlabel('z');ylabel('y')
axes(h3);imagesc(squeeze(D1(y,:,:)),[min(D1(:)) max(D1(:))]);colormap(gray);title('coronal');colorbar;
xlabel('z');ylabel('x')
%         subplot(2,2,4);slice(X,Y,Z,D1,x,y,z);colormap(gray);shading flat;title('3D Slices')
%         xlabel('x');ylabel('y');zlabel('z');
elseif gca==h2
z=A;
y=B;
if z <= 0 || z > DIM(3) || y <= 0 || y > DIM(1)
continue
end
axes(h1);imagesc(D1(:,:,z),[min(D1(:)) max(D1(:))]);colormap(gray);title('axial');colorbar;
xlabel('x');ylabel('y')
axes(h3);imagesc(squeeze(D1(y,:,:)),[min(D1(:)) max(D1(:))]);colormap(gray);title('coronal');colorbar;
xlabel('z');ylabel('x')
%         subplot(2,2,4);slice(X,Y,Z,D1,x,y,z);colormap(gray);shading flat;title('3D Slices')
%         xlabel('x');ylabel('y');zlabel('z');
elseif gca==h3
z=A;
x=B;
if x <= 0 || x > DIM(2) || z <= 0 || z > DIM(3)
continue
end
axes(h1);imagesc(D1(:,:,z),[min(D1(:)) max(D1(:))]);colormap(gray);title('axial');colorbar;
xlabel('x');ylabel('y')
axes(h2);imagesc(squeeze(D1(:,x,:)),[min(D1(:)) max(D1(:))]);colormap(gray);title('sagittal');colorbar;
xlabel('z');ylabel('y')
%         subplot(2,2,4);slice(X,Y,Z,D1,x,y,z);colormap(gray);shading flat;title('3D Slices')
%         xlabel('x');ylabel('y');zlabel('z');
end
end

Something weird in matlab

If you have enough memory, do the following.

a = ones(10000,1000);

Then do something like

Then compare

Case 1:
for i = 1:10
b = sqrt(a.^2);
end

Case 2:

b = sqrt(a.^2);b = sqrt(a.^2);b = sqrt(a.^2);b = sqrt(a.^2);b = sqrt(a.^2);b = sqrt(a.^2);b = sqrt(a.^2);b = sqrt(a.^2);b = sqrt(a.^2);b = sqrt(a.^2);% run in one command line

Case 3:
b = sqrt(a.^2);
b = sqrt(a.^2);
b = sqrt(a.^2);
b = sqrt(a.^2);
b = sqrt(a.^2);
b = sqrt(a.^2);
b = sqrt(a.^2);
b = sqrt(a.^2);
b = sqrt(a.^2);
b = sqrt(a.^2);
% run in separate command lines

You’ll get 1st and 2nd case run almost with the same speed (maybe different, I can’t tell significant difference.) But 3rd case will be much faster.

Why?