a
    bK                     @   s   d dl mZ d dl mZ d dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZmZ d dlmZ d dlmZ ee	jZdZe	jd	 ZdddZdd Zdd Zdd Zdd Zdd Zdd ZdS )    )with_statement)divisionN)join)fastaparserqconfigqutils	reportingplotter)set_window_size)
get_logger      Fc                    s  d}d}t dtj d }dd td|D }dg| }t dtj d }dd td|D }dg| }	d}
|r|
||	f||ffS t| D ]\}td }|sqd	d
 }d| | }|t |tj   d7  < tj	  fddtdt D D ]<}t
|}|dur|	t t |tj tj   d7  < q||7 }||7 }q|dkrld}
n|d | }
|
||	f||ffS )ze
       Returns percent of GC for assembly and GC distribution: (list of GC%, list of # windows)
    r   d      c                 S   s   g | ]}|t j qS  )r   GC_contig_bin_size.0ir   r   @/home/psgendb/BIRCHDEV/pkg/quast-5.2.0/quast_libs/basic_stats.py
<listcomp>       zGC_content.<locals>.<listcomp>c                 S   s   g | ]}|t j qS r   )r   GC_bin_sizer   r   r   r   r   !   r   NNGCg      Y@c                 3   s   | ]}||   V  qd S Nr   r   nseq_fullr   r   	<genexpr>1   r   zGC_content.<locals>.<genexpr>)intr   r   ranger   r   
read_fastalencountGC_window_sizeget_GC_percent)contigs_fpathskipZtotal_GC_amountZtotal_contig_lengthZGC_contigs_bin_numZGC_contigs_distribution_xZGC_contigs_distribution_yZ
GC_bin_numGC_distribution_xGC_distribution_ytotal_GCnameZcontig_ACGT_lenZcontig_GC_lenZcontig_GC_percentseq
GC_percentr   r   r   
GC_content   s:    

$
(

r0   c                 C   sZ   t | tk rd S t | | d }|t | d k r6d S | d| d }d| | }|S )Nr   r   r   r   r   )r$   MIN_GC_WINDOW_SIZEr%   )r.   ZACGT_lenZGC_lenr/   r   r   r   r'   @   s    r'   c           
      C   s   d}t jrt jnt j}t|d}t| D ]z\}}|d| d t| d  t	dt
||D ]B}||||  }t|}	|	d urb|t|d t|	 d  qbq,W d    n1 s0    Y  d S )Nr   w# 
)r   large_genomeGC_window_size_larger&   openr   r#   writestrr"   r$   r'   )
	ref_fpathgc_fpathZ	chr_indexwindow_sizeout_fr-   r   r   r.   r/   r   r   r   save_icarus_GCM   s    r?   c           
      C   s   t |}t|d}t| D ]n\}}tdt||D ]T}||||  }t|}	|	d ur6|d|t	|t	|t| t	|	d g q6qW d    n1 s0    Y  d S )Nr2   r   	r5   )
r
   r8   r   r#   r"   r$   r'   r9   r   r:   )
r;   reference_lengthr<   r=   r>   r-   r   r   r.   r/   r   r   r   save_circos_GCZ   s    rB   c                 C   sH  d}g }g }g }g }t dd | D }t| |D ]\}}	t|}
g }t|D ]\}}||g|  qL||
d  }||
d  }||
d d  }|| }|t|d|   |t|d|   |td| |	d   q0t t|d	}t t|d
}tt ||}|| | |k r6|d	kr6t || | d	}||| 8 }||| 8 }|| d	 }|| tk r|td 8 }|td 7 }|| d	 }d
}||kr|| d	 }||8 }nd
}t| D ]|\}}|d
gt|  t|D ]T\}}|| | }||k rd
}n||kr|d	 }|t| t|  |7  < qq|||||fS )Nr   c                 s   s   | ]}t |V  qd S r   r$   )r   vr   r   r   r    k   r   z#binning_coverage.<locals>.<genexpr>   r      g      ?gUUUUUU?r   r   )	maxzipsum	enumerateextendappendr!   minMIN_HISTOGRAM_POINTS)
cov_valuesZnums_contigsZmin_bins_cntZ	bin_sizesZlow_thresholdsZhigh_thresholdsZcov_by_binsmax_covvaluesnum_contigsZassembly_lenZbases_by_covcoveragebasesq1Zq2q3iqrbin_sizelow_thresholdhigh_threshold
max_pointsoffsetindexZbin_idxr   r   r   binning_coveragee   sZ    



$r^   c                    sD  t  }t   fdd|D }|D ]4}t|tjj||< t|tjj |< q"fdd|D } fdd|D }t||\}}	}
}}dt|	 d }t	j
|||d ||	||
|d |D ]|}t| g | g\}}	}
}}t|}t|}|d	 t|	 d }tj||d
 }t	j
|g|||d|	||
|d	 qd S )Nc                    s   g | ]} | r|qS r   r   r   r(   coverage_dictr   r   r      r   z,draw_coverage_histograms.<locals>.<listcomp>c                    s   g | ]} | qS r   r   r_   r`   r   r   r      r   c                    s   g | ]} | qS r   r   r_   )contigs_dictr   r   r      r   zCoverage histogram (bin size: zx)z/coverage_histogram)rX   rP   rY   rZ   z coverage histogram (bin size: Z_coverage_histogramT)	draw_barsrX   rP   rY   rZ   )dictr   get	get_fieldFieldsTOTALLENCONTIGSr^   r:   r	   coverage_histogramr   label_from_fpathlabel_from_fpath_for_fnameospathr   )ra   contigs_fpathsoutput_dirpath	total_lenZcontigs_with_coverager(   rO   rR   Zcommon_coverage_valuesrX   rY   rZ   rP   Zhistogram_titleZcoverage_valueslabelZ
corr_labelZhistogram_fpathr   )rb   ra   r   draw_coverage_histograms   s2    



rs   c           1         s  t   t d tj|s(t| d g d }d }d }| r tt	| 
 ddt}tt| \}}}	tjstjrt|d}t| | tjrt|d}t| | t d t |d urdtj|  d t d	 t| d
 d|  nd |dkrFtjsFt d n&tjrFtjgt dt  t d g }
g }t  td}t|D ]\}}g  |< t |}t dt!| |  g }d}t"|D ]\}}|#t| ||$d7 }|%|rt&t'|%|d }t | |kr2 |  dg|t |  d  7  <  | |  t|7  < q|
#| |#| qpdd |
D }
t(dd |
D }d|tj)d krRdd l*}t&|tj) | fdd|
D }tdkrfddt+dD td  d  g t+t|D ]4}t|| }|| #t|
| | d   qndd |
D }rtj,rddl-m.} |/| tj,rddl-m.} |0||| |1| t d g }g }d}ddl2m3} tt4||
|D ]Z\}\}}}t56|} |7|\}!}"|8|}#d\}$}%rB|9|\}$}%|8|}&|7|tj:\}'}(d\})}*rv|9|tj:\})}*t|}+t|tj;d \},}-}.|#|- |#|. t |+dkr@dt!| t | d! t|! d" t|" d# |#d urd$|# nd  d% t|+ d
 |,d urd|, nd d& d't'|d( t'|+   nd | <t5j=j3|! | <t5j=j>|" | <t5j=j?|#d urd$|# nd  rtj@s| <t5j=jA|$ | <t5j=jB|% | <t5j=jC|&d urd$|& nd  | <t5j=jD|' | <t5j=jE|( r&tj@s&| <t5j=jF|) | <t5j=jG|* | <t5j=jHt| |r| <t5j=jIt(| t(|t(|}| <t5j=jJ|+ tj@s| <t5j=jK|,d urd|, nd  | <t5j=jL| | <t5j=jMdt'|d( t'|+   | r&| <t5j=jNt& | <t5j=jO| tj@s@| <t5j=jP|d urd| nd  nr| <t5j=jQt& qdd l*}|R|d) d* t_S|}/d }0| rt|/}0|/#| tj,rtj;sddl-m.} |T|||/||0 tUV||tj)k||
t|d+d,g  rtj@stUV||tj)k||
t|d-d.fd/dt+t|D  tjrtUW| ||
t|d0d1 tj;stUX| ||/t|d2 t4||D ](\}}-tUY||-t|t |d3  q^tZ fd4d5|D rt[ || t d6 ||fS )7Nz%Running Basic statistics processor...Treversezgc.icarus.txtzgc.circos.txtz  Reference genome:z    z, length = z, num fragments = z	, GC % = z%.2f	undefined   z  Reference genome is fragmented. You may consider rerunning QUAST using --fragmented option. QUAST will try to detect misassemblies caused by the fragmentation and mark them fake (will be excluded from # misassemblies).z  Estimated reference length = z  Contig files: z_cov_(\d+\.?\d*)r   r   r   c                 S   s   g | ]}t |d dqS Trt   sortedr   listr   r   r   r      r   zdo.<locals>.<listcomp>c                 S   s   g | ]}t |qS r   rC   )r   list_of_lengthr   r   r   r      r   r   c                    s&   g | ]  fd dt dD qS )c                    s8   g | ]0}| t  k rt |d   |  qS )r   r$   rI   r   )r}   multiplicatorr   r   r      s   z!do.<locals>.<listcomp>.<listcomp>r   )r"   )r   )r[   r   )r}   r   r      s   c                    sP   g | ]H}|  t k r4t|d    |   nt|d    d qS )r   Nr~   r   )r   reference_lengthsr   r   r      s   c                 S   s   g | ]}t |d dqS rx   ry   r{   r   r   r   r     r   )
html_saverz  Calculating N50 and L50...)N50)NN)r)   z, N50 = z, L50 = z, auN = z%.1fz, Total length = z, # N's per 100 kbp = z %.2fg     j@i  iX  Nx_plotNxZNGx_plotNGxc                    s   g | ]} qS r   r   r   )rA   r   r   r   ^  r   cumulative_plotzCumulative lengthGC_content_plotZ_GC_content_plotc                 3   s   | ]} | V  qd S r   r   r_   r`   r   r   r    l  r   zdo.<locals>.<genexpr>zDone.)\loggerprint_timestamp	main_inform   rn   isdirmkdirrz   r   get_chr_lengths_from_fastafilerQ   r$   rI   r0   r   create_icarus_html
draw_plotsr   r?   draw_circosrB   infobasenamer:   check_for_fragmented_refwarningestimated_reference_sizerd   recompilerJ   r   rk   index_to_strr#   rL   r%   findallr!   floatrG   r[   mathr"   html_reportquast_libs.html_saverr   Zsave_reference_lengthsZsave_contigs_lengthsZsave_tick_x r   rH   r   re   ZN50_and_L50Z	au_metricZNG50_and_LG50x_for_additional_Nxno_gc	add_fieldrg   L50auNis_combined_refNG50LG50auNGr   Lxr   LGxri   
LARGCONTIGrh   GCUNCALLEDUNCALLED_PERCENTREFLENREF_FRAGMENTSREFGC	ESTREFLENceilmin_differenceZsave_GC_infor	   r   r   r   contigs_GC_content_plotanyrs   )1r;   ro   rp   results_dirZreference_fragmentsicarus_gc_fpathcircos_gc_fpathZreference_GCZreference_GC_distributionZ!reference_GC_contigs_distributionlists_of_lengthsZnumbers_of_NsZcov_patternidr(   assembly_labelr}   Znumber_of_Nsr-   r.   covrR   r   Zcorr_lists_of_lengthsZnum_list
last_indexr   list_of_GC_distributionsZ list_of_GC_contigs_distributionsZlargest_contigr   Zlengths_listreportZn50Zl50r   Zng50Zlg50r   nxlxZngxZlgxZtotal_lengthr,   ZGC_distributionZGC_contigs_distributionZ!list_of_GC_distributions_with_refZreference_indexr   )ra   r[   r   rA   r   r   do   s   








&
(
 



	"""$$
"
r   )F)
__future__r   r   rm   r   os.pathr   
quast_libsr   r   r   r   r	   Zquast_libs.circosr
   quast_libs.logr   LOGGER_DEFAULT_NAMEr   rN   r&   r1   r0   r'   r?   rB   r^   rs   r   r   r   r   r   <module>   s"   


*2