a
    bV                     @   s   d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlmZ d dlmZ ee
jZi ZG dd dZd	d
 Zdd Zdd Zdd ZG dd dZdS )    )with_statementN)defaultdict)fastaparsergenes_parser	reportingqconfigqutils)
get_logger)run_parallelc                   @   s   e Zd ZdddZdS )FeatureContainer c                 C   s   || _ || _g | _i | _d S N)kindfpathsregion_listchr_names_dict)selfr   r    r   D/home/psgendb/BIRCHDEV/pkg/quast-5.2.0/quast_libs/genome_analyzer.py__init__   s    zFeatureContainer.__init__N)r   )__name__
__module____qualname__r   r   r   r   r   r      s   r   c                   C   s   t S r   )ref_lengths_by_contigsr   r   r   r   get_ref_aligned_lengths   s    r   c                 C   s   i }|D ]&}|j |v r$|j ||j < qd||j < qt|dkrt|dkr||d j  du r| }tjd| |d j |f dd |D ]}||_ |||j < qnHtdd | D rtjd	|  dd nd| v rtjd
|  dd |S )z
    returns dictionary to translate chromosome name in list of features (genes or operons) to
    chromosome name in reference file.
    N   r   zReference name in file with genomic features of type "%s" (%s) does not match the name in the reference file (%s). QUAST will ignore this issue and count as if they match.  indentc                 s   s   | ]}|d u V  qd S r   r   ).0chr_namer   r   r   	<genexpr>8       z)chromosomes_names_dict.<locals>.<genexpr>z{Reference names in file with genomic features of type "%s" do not match any chromosome. Check your genomic feature file(s).zSome of the reference names in file with genomic features of type "%s" does not match any chromosome. Check your genomic feature file(s).)seqnamelenpoploggernoticeallvalueswarning)featureregions	chr_namesZregion_2_chr_nameregionr    r   r   r   chromosomes_names_dict"   s4    
*
r/   c           2         s  t | }t | }t }	tt}
tdt | |  t	j
||d }tjrX|}n|d }t	j
|stjd| d dd dS i }| D ]\}}dg|d	  ||< qt| }tt|d
d dd}g }g }|D ] \}\}}|| || qdgt|  dgt| i }t|}tjr<|r<td |rV|D ]}g ||< qFt|}|D ]}t|dd  d }t|dd  d	 }t|dd	  d }t|dd	  d	 }| d  }| d  }||vr"td| d   W d    d S |rF|| t||||||d t||d	 D ]} d	|| | < qTqfW d    n1 s0    Y  | D ]2}|| D ]} d|| | < qt|| |
|< qtj r|!drt	"| d}!tj#rtj st	j
||d nd}"t|"d}#| D ]\}}|#$|d  d}$td	|d	 D ]n} || |  d	ksp| || v r|$tj%kr|!d	7 }!|#$t&| |$ d t&| d	  d  d}$n|$d	7 }$qL|$tj%kr$|!d	7 }!|#$t&||$ d	 d t&| d  q$W d    n1 s0    Y  |!|	d< d |	t'j(j)d < d |	t'j(j)d < d |	t'j(j*d < d |	t'j(j*d < |D ] }%|%j+sxqfd}&d}'t	j
||d |%j,-  d }(t|(d})|)$dd   |)$d! dgt|%j+ }*t|%j+D ]\} d|*| < g }+j.d u rd"t&j/d	  _.t|D ]Z\},}d#}-|| D ]4}.|.j0j0krHq0j1|.j2ks0|.j1j2krjq0n|.j2j2krj1|.j1kr|*|  d$kr|'d	8 }'d	|*| < |&d	7 }&|.3}/|)$d%j.j2j1|/f  |%j,d&kr|,  d	7  < n |,  d	7  < d}- qhnLt4j1|.j1t5j2|.j2 tj6krX|*|  dkrNd$|*| < |'d	7 }'|+|. |-r0 qhq0|-r qxq|*|  d$krd'fd(d)t|+d*d d+D }/|)$d,j.j2j1|/f  q|%j,d&kr|&|	t'j(j*d < |'|	t'j(j*d < nf|	t'j(j)d  d u r.d|	t'j(j)d < d|	t'j(j)d < |	t'j(j)d   |&7  < |	t'j(j)d   |'7  < |)7  qftdt | d-   fd.d)|D }0fd/d)|D }1|
|	|0 |1ffS )0Nr   z.coordsz	.filteredzFile with alignment coords (z") not found! Try to restart QUAST.r   NNr   r   c                 S   s   t | d d S )Nr   )r$   )xr   r   r   <lambda>`   r"   z%process_single_file.<locals>.<lambda>T)keyreversezAnalysis of genes and/or operons files (provided with -g and -O) requires extensive RAM usage, consider running QUAST without them if memory consumption is critical.|      z?Something went wrong and chromosome names in your coords file (zS) differ from the names in the reference. Try to remove the file and restart QUAST.r#   startendcontigstart_in_contigend_in_contigz	_gaps.txtz	/dev/nullw
 
gaps_count_full_partialZ_genomic_features_z.txtz%s		%s	%s	%s	%s
)zID or #ZStartZEndTypeContigz3==================================================
z# F   z%s		%d	%d	complete	%s
operon,c                    s   g | ]}|  qS r   )format_gene_info)r   block)r.   r   r   
<listcomp>   r"   z'process_single_file.<locals>.<listcomp>c                 S   s   | j S r   )r9   )rJ   r   r   r   r2      r"   )r3   z%s		%d	%d	partial	%s
zAnalysis is finished.c                    s   g | ]} | qS r   r   r   idx)features_in_contigsr   r   rK      r"   c                    s   g | ]} | qS r   r   rL   )operons_in_contigsr   r   rK      r"   )8r   label_from_fpathlabel_from_fpath_for_fnamedictr   intr&   infoindex_to_strospathjoinr   use_all_alignmentsisfileerroritemsr   
read_fastasorted	enumerateappendr$   memory_efficientr*   opensplitstripAlignedBlockrangekeyssumspace_efficientendswithremoveanalyze_gapswritemin_gap_sizestrr   FieldsGENESOPERONSr   r   loweridnumberr#   r:   r9   rI   minmaxmin_gene_overlapclose)2contigs_fpathindexcoords_dirpathgenome_stats_dirpathreference_chromosomesns_by_chromosomes
containersassembly_labelZcorr_assembly_labelresultsref_lengthsZcoords_base_fpathcoords_fpathZgenome_mappingr    chr_lenZcontig_tuplesZsorted_contig_tuplesZsorted_contigs_namesZcontigs_orderrM   name_Zaligned_blocks_by_contig_nameZgene_searching_enabledZ	coordfilelines1e1s2e2Zcontig_nameirA   Z
gaps_fpathZ	gaps_fileZcur_gap_size	container
total_fullZtotal_partialZfound_fpathZ
found_fileZ
found_listZgene_blocks	contig_idZcur_feature_is_foundZ	cur_blockZcontig_infounsorted_features_in_contigsunsorted_operons_in_contigsr   )rN   rO   r.   r   process_single_fileB   s   






4
 &J
 


$


$ r   c           3   	      s  t j|tjddlm} |jr0t jdt	  t
d t jsXt  t| \}t jd}	t|	d}
g  | D ]\}} t|g| q|stjddd	 |rЈ t|d
 ntjddd	  D ]}|jsq|jD ]}| jt||j7  _qt|jdkrXtjd|j d dd	 |
d|j d d d  qtdtt|j d |j d  |
d|j d tt|j d  t|j|jt  |_!qd\}}|D ]v}t"#|}d} D ]B}|jd
krt|j}|$t"j%j&t|j n|t|j7 }q|r|}|$t"j%j'| qi }i }i }i }g }g }g }tj(}t)t|tj*} fddt+|D }t,t-||dd\}|t|t 7 }|t_(st
d |
.  d S D ]&fddt/tD t0< q|
d  D ]\\}} t1t0| }!|
d| d t|  d d  t| t|   d! t|! d"  q|
d |
d#t| d$  |
d%ttj2 d  |
d&ttj3 d$  |
d$ |
d'd(  |
d'd)  |
d* t4||D ]t\}\}"}#}$}%}&t56|}'|$||< |#||< |&||< |%||< |t7|$ |t7|& |"d+ }(|"t"j%j8d,  })|"t"j%j8d-  }*|"t"j%j9d,  }+|"t"j%j9d-  },t"#|}|
d.|'d d/ |:t"j%j;|:t"j%j<|(f  |t=|:t"j%j; t"j%j8|)|*ft"j%j9|+|,ffD ]V\}-}.}/|.d u rD|/d u rD|
d0d1  n&|
d0|.|/f  |$|-d2|.|/f  q|
d q|
.  tj>rdd3l?m@}0 |r|0A||d4|| |r|0A||d5|| tjBrd6d7lCmD}1 dd8lEmF}2 |r2|1G|||d9 d: |1H|| ||2|d; d: |1I||d< d= |r||1G|||d> d5 |1H|| ||2|d? d5 |1I||d@ dA |1jI||dB dCdDdE t
dF  S )GNr   )search_references_metarawzRunning Genome analyzer...zgenome_info.txtr>   zbNo file with genomic features were provided. Use the --features option if you want to specify it.
r   r   rG   zPNo file with operons were provided. Use the -O option if you want to specify it.zNo genomic features of type "z" were loaded.zGenomic features of type "z
" loaded: Noner?   z	  Loaded z genomic features of type ""r0   c              	      s"   g | ]\}}|| fqS r   r   )r   r{   rz   )r   r|   r}   r   r~   r   r   rK   D  s   zdo.<locals>.<listcomp>T)filter_resultsz.Genome analyzer failed for all the assemblies.c                    s   g | ]}|   qS r   r   )r   r   )refr   r   r   rK   P  r"   zreference chromosomes:
	z (total length: z bp, ztotal length without N's: z bp, maximal covered length: z bp)
ztotal genome size: z

zgap min size: zpartial gene/operon min size: z8%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|
)assemblygenomeZduplicationZgapsgenespartialoperonsr   )r   fractionratioru   r   r   r   r   zy========================================================================================================================
rA   rB   rC   z%-25s| %-10s| %-12s| %-10s|   z %-10s| %-10s|)-r   z%s + %s part)
html_saverfeaturesr   r   )plotter)contigs_aligned_lengthsz/features_cumulative_plotzgenomic featuresz/features_frcurve_plotz/complete_features_histogramz# complete genomic featuresz/operons_cumulative_plotz/operons_frcurve_plotz/complete_operons_histogramz# complete operonsz/genome_fraction_histogramzGenome fraction, %d   )	top_valuezDone.)JrV   rW   rX   r   aligner_output_dirname
quast_libsr   is_quast_first_runr&   print_timestamp	main_infoisdirmkdirr   get_genome_statsrb   r\   r`   r   r'   r   r   r   get_genes_from_filer   r$   r*   rm   rT   ro   r/   listrg   r   r   get	add_fieldrp   REF_OPERONS	REF_GENES_num_nf_errorsrv   max_threadsr_   r
   r   ry   rf   r   rw   rn   rx   zipr   name_from_fpathrh   rq   rr   	get_fieldMAPPEDGENOMEDUPLICATION_RATIOfloathtml_reportquast_libs.html_saverr   save_features_in_contigs
draw_plotsr   r   quast_libs.ca_utils.miscr   genes_operons_plotfrc_plot	histogram)3	ref_fpathaligned_contigs_fpathsoutput_dirpathZfeatures_dictZoperons_fpaths detailed_contigs_reports_dirpathr}   r   genome_sizeZresult_fpathZres_filer+   feature_fpathr   fpathZref_genes_numZref_operons_numrz   reportZgenomic_featuresZfiles_features_in_contigsZ"files_unsorted_features_in_contigsZfiles_operons_in_contigsZ!files_unsorted_operons_in_contigsZgenome_mappedZfull_found_genesZfull_found_operonsZnum_nf_errorsn_jobsparallel_run_argsZresults_genes_operons_tuplesr    r   aligned_lenr   r   rN   r   rO   assembly_namerA   Z
genes_fullZ
genes_partZoperons_fullZoperons_partfieldfullpartr   r   r   r   )r   r|   r}   r   r   r   r~   r   do   s6   



&&


$






&


r   c                   @   s   e Zd ZdddZdd ZdS )re   Nc                 C   s(   || _ || _|| _|| _|| _|| _d S r   r8   )r   r#   r9   r:   r;   r<   r=   r   r   r   r     s    zAlignedBlock.__init__c                 C   s   | j | j }}| j|jk r@|j| j }||k r8||7 }n||8 }|j| jk rz|jt|j| j }||k rr|| }n|| }| jd t| d t| S )N:r   )r<   r=   r9   r:   rw   r;   ro   )r   r.   r9   r:   Zregion_shiftZregion_sizer   r   r   rI     s    

zAlignedBlock.format_gene_info)NNNNNN)r   r   r   r   rI   r   r   r   r   re     s   
re   )
__future__r   loggingrV   collectionsr   r   r   r   r   r   r   quast_libs.logr	   quast_libs.qutilsr
   LOGGER_DEFAULT_NAMEr&   r   r   r   r/   r   r   re   r   r   r   r   <module>   s    
  0 6