A simple, self-explanatory MATLAB script to identify duplicate headers in FASTA files.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
| %Simple matlab code to check for the duplicate header in fasta files
%Store the size of header -> unique(header) ->size of new header
%Copy the Table data in excel and compare the two values
% Count total vs. unique FASTA headers for every file in one folder.
% Result: Table is an N-by-3 cell array {file name, header count, unique
% header count}; a file contains duplicate headers when columns 2 and 3
% differ. Requires fastaread (Bioinformatics Toolbox).
clear; clc;
tic;
Path = 'Drive\Path\FileName';% folder that holds the fasta files
FileList = dir(Path);
% Drop '.', '..' and any sub-folders by flag rather than by position:
% dir() does not guarantee that '.' and '..' are the first two entries.
FileList = FileList(~[FileList.isdir]);
Fas_Fname = {FileList.name}';   % column cell array of file names
rFas = numel(Fas_Fname);
Table = cell(rFas, 3);          % preallocated; stays defined for an empty folder
for i = 1:rFas
    % fullfile inserts the file separator that plain concatenation omits
    OpenFile = fullfile(Path, Fas_Fname{i});
    [Header, ~] = fastaread(OpenFile);
    % fastaread returns a char vector (not a cell array) when the file holds
    % a single sequence; wrap it so length/unique count headers, not characters
    if ischar(Header)
        Header = {Header};
    end
    Old_Header = length(Header);
    Unik_header = length(unique(Header));
    Table{i,1} = Fas_Fname{i};            % plain string, so it pastes cleanly
    Table{i,2} = num2str(Old_Header);
    Table{i,3} = num2str(Unik_header);
    fprintf('Finished %d\t of %d\n', i, rFas);
end
toc;
|
Comments
Post a Comment