3
bK                 @   s   d dl mZ d dl mZ d dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZmZ d dlmZ d dlmZ ee	jZdZe	jd	 ZdddZdd Zdd Zdd Zdd Zdd Zdd ZdS )    )with_statement)divisionN)join)fastaparserqconfigqutils	reportingplotter)set_window_size)
get_logger      Fc                s  d}d}t dtj d }dd td|D }dg| }t dtj d }dd td|D }dg| }	d}
|r|
||	f||ffS xtj| D ]\}tjd }|sqjd	jd
 }d| | }|t |tj   d7  < tj	 x` fddtdt D D ]<}t
|}|dk	r|	t t |tj tj   d7  < qW ||7 }||7 }qW |dkrtd}
n|d | }
|
||	f||ffS )ze
       Returns percent of GC for assembly and GC distribution: (list of GC%, list of # windows)
    r   d      c             S   s   g | ]}|t j qS  )r   GC_contig_bin_size).0ir   r   @/home/psgendb/BIRCHDEV/pkg/quast-5.2.0/quast_libs/basic_stats.py
<listcomp>   s    zGC_content.<locals>.<listcomp>c             S   s   g | ]}|t j qS r   )r   GC_bin_size)r   r   r   r   r   r   !   s    NNGCg      Y@c             3   s   | ]}||   V  qd S )Nr   )r   r   )nseq_fullr   r   	<genexpr>1   s    zGC_content.<locals>.<genexpr>)intr   r   ranger   r   
read_fastalencountGC_window_sizeget_GC_percent)contigs_fpathskipZtotal_GC_amountZtotal_contig_lengthZGC_contigs_bin_numZGC_contigs_distribution_xZGC_contigs_distribution_yZ
GC_bin_numGC_distribution_xGC_distribution_ytotal_GCnameZcontig_ACGT_lenZcontig_GC_lenZcontig_GC_percentseq
GC_percentr   )r   r   r   
GC_content   s:    

&
*
r,   c             C   sZ   t | tk rd S t | | jd }|t | d k r6d S | jd| jd }d| | }|S )Nr   r   r   r   r   )r    MIN_GC_WINDOW_SIZEr!   )r*   ZACGT_lenZGC_lenr+   r   r   r   r#   @   s    r#   c       
      C   s   d}t jrt jnt j}t|d}xtj| D ]~\}}|jd| d t| d  xVt	dt
||D ]B}||||  }t|}	|	d k	rf|jt|d t|	 d  qfW q.W W d Q R X d S )Nr   w# 
)r   large_genomeGC_window_size_larger"   openr   r   writestrr   r    r#   )
	ref_fpathgc_fpathZ	chr_indexwindow_sizeout_fr)   r   r   r*   r+   r   r   r   save_icarus_GCM   s    r;   c       
      C   s   t |}t|d}xtj| D ]r\}}xhtdt||D ]T}||||  }t|}	|	d k	r:|jdj|t	|t	|t| t	|	d g q:W q W W d Q R X d S )Nr.   r   	r1   )
r
   r4   r   r   r   r    r#   r5   r   r6   )
r7   reference_lengthr8   r9   r:   r)   r   r   r*   r+   r   r   r   save_circos_GCZ   s    r>   c             C   sX  d}g }g }g }g }t dd | D }xt| |D ]\}}	t|}
g }x$t|D ]\}}|j|g|  qPW ||
d  }||
d  }||
d d  }|| }|jt|d|   |jt|d|   |jtd| |	d   q2W t t|d	}t t|d
}tt ||}|| | |k r>|d	kr>t || | d	}||| 8 }||| 8 }|| d	 }|| tk r|td 8 }|td 7 }|| d	 }d
}||kr|| d	 }||8 }nd
}xt| D ]\}}|jd
gt|  x`t|D ]T\}}|| | }||k rd
}n||kr"|d	 }|t| t|  |7  < qW qW |||||fS )Nr   c             s   s   | ]}t |V  qd S )N)r    )r   vr   r   r   r   k   s    z#binning_coverage.<locals>.<genexpr>   r      g      ?g      ?r   r   gUUUUUU?)	maxzipsum	enumerateextendappendr   minMIN_HISTOGRAM_POINTS)
cov_valuesZnums_contigsZmin_bins_cntZ	bin_sizesZlow_thresholdsZhigh_thresholdsZcov_by_binsmax_covvaluesnum_contigsZassembly_lenZbases_by_covZcoveragebasesZq1Zq2Zq3Ziqrbin_sizelow_thresholdhigh_threshold
max_pointsoffsetindexZbin_idxr   r   r   binning_coveragee   sZ    



(rU   c                sL  t  }t   fdd|D }x<|D ]4}tj|jtjj||< tj|jtjj |< q$W fdd|D } fdd|D }t||\}}	}
}}dt|	 d }t	j
|||d ||	||
|d x|D ]|}t| g | g\}}	}
}}tj|}tj|}|d	 t|	 d }tjj||d
 }t	j
|g|||d|	||
|d	 qW d S )Nc                s   g | ]} | r|qS r   r   )r   r$   )coverage_dictr   r   r      s    z,draw_coverage_histograms.<locals>.<listcomp>c                s   g | ]} | qS r   r   )r   r$   )rV   r   r   r      s    c                s   g | ]} | qS r   r   )r   r$   )contigs_dictr   r   r      s    zCoverage histogram (bin size: zx)z/coverage_histogram)rO   rK   rP   rQ   z coverage histogram (bin size: Z_coverage_histogramT)	draw_barsrO   rK   rP   rQ   )dictr   get	get_fieldFieldsTOTALLENCONTIGSrU   r6   r	   coverage_histogramr   label_from_fpathlabel_from_fpath_for_fnameospathr   )rV   contigs_fpathsoutput_dirpath	total_lenZcontigs_with_coverager$   rJ   rM   Zcommon_coverage_valuesrO   rP   rQ   rK   Zhistogram_titleZcoverage_valueslabelZ
corr_labelZhistogram_fpathr   )rW   rV   r   draw_coverage_histograms   s,    




rh   c       1         s  t j  t jd tjj|s(tj| d g d }d }d }| r"ttj	| j
 ddt}tt| \}}}	tjstjrt|d}t| | tjrt|d}t| | t jd t j|d k	rdtjj|  d t d	 t| d
 d|  nd |dkrHtj rHt jd n&tjrHtjgt jdt  t jd g }
g }t  tjd}xt|D ]\}}g  |< tj |}t jdtj!| |  g }d}xtj"|D ]\}}|j#t| ||j$d7 }|j%|rt&t'|j%|d }t | |kr: |  dg|t |  d  7  <  | |  t|7  < qW |
j#| |j#| qvW dd |
D }
t(dd |
D }d|tj)d krbdd l*}t&|tj) | fdd|
D }tdkrfddt+dD td  d  g xTt+t|D ]4}t|| }|| j#t|
| | d   q(W ndd |
D }rtj,rddl-m.} |j/| tj,rddl-m.} |j0||| |j1| t jd g }g }d}ddl2m3} xttt4||
|D ]^\}\}}}t5j6|} |j7|\}!}"|j8|}#d6\}$}%rV|j9|\}$}%|j8|}&|j7|tj:\}'}(d7\})}*r|j9|tj:\})}*t|}+t|tj;d\},}-}.|j#|- |j#|. t j|+dkrTdtj!| tj | d  t|! d! t|" d" |#d k	rd#|# nd  d$ t|+ d
 |,d k	r2d|, nd d% d&t'|d' t'|+   nd | j<t5j=j3|! | j<t5j=j>|" | j<t5j=j?|#d k	rd#|# nd  rtj@ r| j<t5j=jA|$ | j<t5j=jB|% | j<t5j=jC|&d k	rd#|& nd  | j<t5j=jD|' | j<t5j=jE|( r>tj@ r>| j<t5j=jF|) | j<t5j=jG|* | j<t5j=jHt| |r| j<t5j=jIt(| t(|t(|}| j<t5j=jJ|+ tj@s| j<t5j=jK|,d k	rd|, nd  | j<t5j=jL| | j<t5j=jMdt'|d' t'|+   | r>| j<t5j=jNt& | j<t5j=jO| tj@sX| j<t5j=jP|d k	r6d| nd  nr| j<t5j=jQt& qW dd l*}|jR|d( d) t_S|}/d }0| rt|/}0|/j#| tj,rtj; rddl-m.} |jT|||/||0 tUjV||tj)k||
t|d*d+g  r4tj@ r4tUjV||tj)k||
t|d,d-fd.dt+t|D  tjrtUjW| ||
t|d/d0 tj;stUjX| ||/t|d1 x6t4||D ](\}}-tUjY||-t|tj |d2  q~W tZ fd3d4|D rt[ || t jd5 ||fS )8Nz%Running Basic statistics processor...T)reversezgc.icarus.txtzgc.circos.txtz  Reference genome:z    z, length = z, num fragments = z	, GC % = z%.2f	undefined   z  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option. QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).z  Estimated reference length = z  Contig files: z_cov_(\d+\.?\d*)r   r   r   c             S   s   g | ]}t |d dqS )T)ri   )sorted)r   listr   r   r   r      s    zdo.<locals>.<listcomp>c             S   s   g | ]}t |qS r   )r    )r   list_of_lengthr   r   r   r      s    r   c                s&   g | ]  fd dt dD qS )c                s8   g | ]0}| t  k rt |d   |  qS )r   )r    rD   )r   r   )rn   multiplicatorr   r   r      s    z!do.<locals>.<listcomp>.<listcomp>r   )r   )r   )rR   ro   )rn   r   r      s   c                sP   g | ]H}|  t k r4t|d    |   nt|d    d qS )r   N)r    rD   )r   r   )ro   reference_lengthsr   r   r      s   c             S   s   g | ]}t |d dqS )T)ri   )rl   )r   rm   r   r   r   r     s    )
html_saverz  Calculating N50 and L50...)N50)r%   z, N50 = z, L50 = z, auN = z%.1fz, Total length = z, # N's per 100 kbp = z %.2fg     j@i  iX  Nx_plotNxZNGx_plotNGxc                s   g | ]} qS r   r   )r   r   )r=   r   r   r   ^  s    cumulative_plotzCumulative lengthGC_content_plotZ_GC_content_plotc             3   s   | ]} | V  qd S )Nr   )r   r$   )rV   r   r   r   l  s    zdo.<locals>.<genexpr>zDone.)NN)NN)\loggerprint_timestamp	main_inforb   rc   isdirmkdirrl   r   get_chr_lengths_from_fastafilerL   r    rD   r,   r   create_icarus_html
draw_plotsr   r;   draw_circosr>   infobasenamer6   check_for_fragmented_refwarningestimated_reference_sizerY   recompilerE   r   r`   index_to_strr   rG   r!   findallr   floatrB   rR   mathr   html_reportquast_libs.html_saverrq   Zsave_reference_lengthsZsave_contigs_lengthsZsave_tick_x rr   rC   r   rZ   ZN50_and_L50Z	au_metricZNG50_and_LG50x_for_additional_Nxno_gc	add_fieldr\   L50auNis_combined_refNG50LG50auNGrt   Lxru   LGxr^   
LARGCONTIGr]   GCUNCALLEDUNCALLED_PERCENTREFLENREF_FRAGMENTSREFGC	ESTREFLENceilmin_differenceZsave_GC_infor	   rs   rv   rw   contigs_GC_content_plotanyrh   )1r7   rd   re   results_dirZreference_fragmentsicarus_gc_fpathcircos_gc_fpathZreference_GCZreference_GC_distributionZ!reference_GC_contigs_distributionlists_of_lengthsZnumbers_of_NsZcov_patternidr$   assembly_labelrn   Znumber_of_Nsr)   r*   ZcovrM   r   Zcorr_lists_of_lengthsZnum_list
last_indexrq   list_of_GC_distributionsZ list_of_GC_contigs_distributionsZlargest_contigrr   Zlengths_listreportZn50Zl50r   Zng50Zlg50r   ZnxZlxZngxZlgxZtotal_lengthr(   ZGC_distributionZGC_contigs_distributionZ!list_of_GC_distributions_with_refZreference_indexr   )rV   rR   ro   r=   rp   r   do   s,   





B


&
(*
$



"""$$
"
r   )F)
__future__r   r   rb   r   Zos.pathr   
quast_libsr   r   r   r   r	   Zquast_libs.circosr
   quast_libs.logr   LOGGER_DEFAULT_NAMErx   rI   r"   r-   r,   r#   r;   r>   rU   rh   r   r   r   r   r   <module>   s"   


*2