3
b                 @   s
  d dl mZ d dl mZ d dlmZ d dlmZmZmZ d dl	m
Z
 e
ejZd dlmZ G dd dZG d	d
 d
eZG dd deZG dd deZd+ddZdd Zdd Zdd Zd,ddZdd Zdd Zdd  Zd!d" Zd-d#d$Zd%d& Zd'd( Zd)d* Z dS ).    )with_statement)division)qconfig)is_same_referenceget_ref_by_chromosomeparse_cs_tag)
get_logger)correct_namec               @   sL   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdS )Misassemblyr                           	   
                  N)__name__
__module____qualname__LOCAL	INVERSION
RELOCATIONTRANSLOCATIONINTERSPECTRANSLOCATIONSCAFFOLD_GAPLOCAL_SCAFFOLD_GAP
FRAGMENTEDZPOTENTIALLY_MIS_CONTIGSPOSSIBLE_MISASSEMBLIES
MATCHED_SVPOTENTIAL_MGESCF_INVERSIONZSCF_RELOCATIONZSCF_TRANSLOCATIONZSCF_INTERSPECTRANSLOCATION r)   r)   S/home/psgendb/BIRCHDEV/pkg/quast-5.2.0/quast_libs/ca_utils/analyze_misassemblies.pyr
      s    r
   c               @   s    e Zd Zd	Zdd Zdd ZdS )
StructuralVariations
inversionsrelocationstranslocationsc             C   s   g | _ g | _g | _d S )N)r,   r-   r.   )selfr)   r)   r*   __init__*   s    zStructuralVariations.__init__c             C   s   t | jt | j t | j S )N)lenr,   r-   r.   )r/   r)   r)   r*   	get_count/   s    zStructuralVariations.get_countN)r,   r-   r.   )r   r   r   	__slots__r0   r2   r)   r)   r)   r*   r+   '   s   r+   c               @   sh   e Zd Zd$Zd%ddZedd Zdd Zdd Zdd Z	d&ddZ
dd Zdd Zd d! Zd"d# ZdS )'Mappings1e1s2e2len1len2idyrefcontigcigarns_possv_typeNc          
   C   sP   |||||||||	f	\	| _ | _| _| _| _| _| _| _| _|
| _	|| _
|| _d S )N)r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   )r/   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r)   r)   r*   r0   6   s    :zMapping.__init__c                s    j    d  d   koL d   koL d   koL d   koLdkn  sZt  d } d } fd	d
dD \}}}}}}	t d }
 d }t||||||	|
|||
S )Nr   r   r   r   r   |r   r   c                s   g | ]}t  | qS r)   )int).0i)liner)   r*   
<listcomp>D   s    z%Mapping.from_line.<locals>.<listcomp>r   r   r   r   r   r   r   r   )r   r   r   r   r   r   )splitAssertionErrorfloatr4   )clsrE   r<   r=   r5   r6   r7   r8   r9   r:   r;   r>   r)   )rE   r*   	from_line<   s    RzMapping.from_linec             C   s@   dj dd | j| jd| j| jd| j| jd| jd| j| j	gD S )N c             s   s   | ]}t |V  qd S )N)str)rC   xr)   r)   r*   	<genexpr>J   s    z"Mapping.__str__.<locals>.<genexpr>rA   )
joinr5   r6   r7   r8   r9   r:   r;   r<   r=   )r/   r)   r)   r*   __str__I   s    (zMapping.__str__c             C   sF   dj dd | j| jd| j| jd| j| jd| jd| j| j	d| j
gD S )NrL   c             s   s   | ]}t |V  qd S )N)rM   )rC   rN   r)   r)   r*   rO   N   s    z%Mapping.coords_str.<locals>.<genexpr>rA   )rP   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   )r/   r)   r)   r*   
coords_strM   s    (zMapping.coords_strc          
   C   s0   dj dd | j| jd| j| jd| j| jgD S )NrL   c             s   s   | ]}t |V  qd S )N)rM   )rC   rN   r)   r)   r*   rO   R   s    z$Mapping.short_str.<locals>.<genexpr>rA   )rP   r5   r6   r7   r8   r9   r:   )r/   r)   r)   r*   	short_strQ   s    zMapping.short_str Truec             C   s4   dj dd | j| j| j| j| j| j| j||g	D S )N	c             s   s   | ]}t |V  qd S )N)rM   )rC   rN   r)   r)   r*   rO   U   s    z,Mapping.icarus_report_str.<locals>.<genexpr>)rP   r5   r6   r7   r8   r<   r=   r;   )r/   	ambiguityis_bestr)   r)   r*   icarus_report_strT   s    zMapping.icarus_report_strc             C   s.   t | j| j| j| j| j| j| j| j| j	| j

S )N)r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   )r/   r)   r)   r*   cloneW   s    zMapping.clonec             C   s   t | j| jS )z&Return start on contig (always <= end))minr7   r8   )r/   r)   r)   r*   startZ   s    zMapping.startc             C   s   t | j| jS )z&Return end on contig (always >= start))maxr7   r8   )r/   r)   r)   r*   end^   s    zMapping.endc             C   s   | j | jk S )z7Returns True for positive strand and False for negative)r7   r8   )r/   r)   r)   r*   
pos_strandb   s    zMapping.pos_strand)r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   )
NNNNNNNNNN)rT   rU   )r   r   r   r3   r0   classmethodrK   rQ   rR   rS   rY   rZ   r\   r^   r_   r)   r)   r)   r*   r4   3   s   

r4   c               @   s    e Zd Zd
Zdd Zdd Zd	S )
IndelsInfo
mismatches
insertions	deletionsindels_listc             C   s   d| _ d| _d| _g | _d S )Nr   )rb   rc   rd   re   )r/   r)   r)   r*   r0   j   s    zIndelsInfo.__init__c             C   sD   |  j |j 7  _ |  j|j7  _|  j|j7  _|  j|j7  _| S )N)rb   rc   rd   re   )r/   otherr)   r)   r*   __add__p   s
    zIndelsInfo.__add__N)rb   rc   rd   re   )r   r   r   r3   r0   rg   r)   r)   r)   r*   ra   g   s   ra   Nc             C   s   |j | j d }| j |j d }| j r6|j r6|}n0| j  rP|j  rP|}n|j | j krb|}n|}d}|d k	r|}| j|jk r|| tjk r|| }n"| j|jkr|| tjk r|| }t|t|k r|}d}||fS )Nr   FT)r5   r6   r_   r   extensive_misassembly_thresholdabs)align1align2Zcyclic_ref_lendistance_align1_align2distance_align2_align1Zdistancecyclic_momentZcyclic_distancer)   r)   r*   distance_between_alignmentsx   s(    
ro   c             C   sJ   |j | j d }| j |j d }|j | j kr:td| }ntd| }|S )Nr   r   )r5   r6   r]   )rj   rk   rl   rm   Zoverlapr)   r)   r*   cyclic_back_ends_overlap   s    rp   c             C   sL   | j  r|| j | j n| jd }|j  r4|jd n||j |j }||gS )Nr   )r_   r<   r6   r5   )rj   rk   ref_lensZgap1Zgap2r)   r)   r*   __get_border_gaps   s    ""rr   c             C   sR   | j |j krdS tjrNtjr0t| j |j  r0dS tdd t| ||D rNdS dS )NFc             S   s   g | ]}|t jkqS r)   )r   fragmented_max_indent)rC   dr)   r)   r*   rF      s    z8is_fragmented_ref_fake_translocation.<locals>.<listcomp>T)r<   r   check_for_fragmented_refis_combined_refr   allrr   )rj   rk   rq   r)   r)   r*   $is_fragmented_ref_fake_translocation   s    rx   Fc             C   s  |j  | j  d }|r$|t|7 }|r,|nd }	|	d k	r\| j|jkr\t| ||	| j \}
}nt| |\}
}d}|dk r|
dkr| }n|
 | k r|
| }| j| jk}|j|jk}|
| }||||ddd}|rt||| |rd|d< d|fS |ot| |||rd|d< d|fS |r4t	t
| |||d< d|fS |r^t| |t|kr^t| | |d< | j|jkst|d tjks||krd|fS d|fS )	Nr   r   F)inconsistencydistance_on_contigmisassembly_internal_overlaprn   is_svis_scaffold_gapTr}   r|   ry   )r\   r^   r1   r<   ro   r7   r8   check_is_scaffold_gapcheck_svsumrr   rp   ri   r   rh   )rj   rk   
contig_seqrq   	is_cyclicregion_struct_variationsis_fake_translocationis_cyclic_contigrz   cyclic_ref_lensdistance_on_referencern   r{   Zstrand1Zstrand2ry   aux_datar)   r)   r*   is_misassembly   sD    
"
r   c                s  ddt jd }fdd  fdd} fdd	}| j|jkrtx*|jD ] }|| ||sh||| |rLd
S qLW nt| j| jk |j|jk krt|t jk rx6|jD ],}| j|d jkr|| |s|||rd
S qW n|j}|j	| j	k r|j	| j
 }	}
n| j
|j	 }	}
xt|D ]\}}|d j| jkr |	|d r |
|d rNd
S |d jdkr|d j
}|d }xp|t|k r|| d j	| |kr|| d j| jkr|| } |
|d rd
S |d j
}|d7 }qrW qW dS )Nd      r   c                s6   |j dkrn }|j| |   ko0|j| kS   S )NQuastDEL)r@   r5   r6   )possvZ	max_error)max_error_svmax_error_trivial_delr)   r*   
__match_ci   s    zcheck_sv.<locals>.__match_cic                sH   |d j | j krD|d j |j krD | j|d rD |j|d rDdS d S )Nr   r   T)r<   r6   r5   )rj   rk   r   )r   r)   r*   __check_translocation   s      z'check_sv.<locals>.__check_translocationc                sx    | j |d r:|d j | j  ko0|d jkn  r:dS  | j|d rt|d j | j   koj|d jkn  rtdS d S )Nr   r   T)r5   r6   )alignr   )r   r)   r*   __check_inversion   s    66z#check_sv.<locals>.__check_inversionTr   r   r   F)r   rh   r<   r.   r7   r8   ri   r,   r-   r5   r6   	enumerater@   r1   )rj   rk   ry   r   max_gapr   r   r   Z
variationsZsv_startZsv_endindexZprev_endZindex_variationr)   )r   r   r   r*   r      sD    
&$"

r   c             C   s6  | sd S t  }t| }x
|D ] }|jd}|jds"ytt|d t|d t|d |d d}tt|d t|d	 t|d
 |d d}|j|jkr|jj	||f nVd|d kr|j
j	||f n8d|d ksd|d ksd|d kr
|jj	||f n W q" tk
r"   Y q"X q"W W d Q R X |S )NrV   #r   r   r   r   )r5   r6   r<   r@   r   r   r   ZINVZDELZINSZBND)r+   openrG   
startswithr4   rB   r	   r<   r.   appendr,   r-   
ValueError)	bed_fpathr   frE   fsrj   rk   r)   r)   r*   find_all_sv  s(    

,,&r   c             C   s$  g }xt |t | k rt |}| |  sJ|d t | ksJ| |d   rV|jd q| | d \}}}}| |d  d \}}}	||	ko|tjko|j|jko|j|jk |j|jk kr|j |j  }
}||
 d }t	||\}}|| }t
|tjkr|jd |jd q|jd qW |S )Nr   Fr   T)r1   r   r   rh   r<   r7   r8   r^   r\   ro   ri   )misassembliesis_potential_mgeidxrj   start_in_refZms_typeZmge_lenrk   
end_in_refZms_type2Zstart_in_contigZend_in_contigrz   r   rn   ry   r)   r)   r*   detect_potential_mge0  s(    (


r   c             C   sR   t | tjkrN|j|jkrN|j |j krN|j |j|jk krNt|||rNdS dS )NTF)ri   r   scaffolds_gap_thresholdr<   r_   r5   is_gap_filled_ns)ry   r   rj   rk   r)   r)   r*   r~   H  s
    $r~   c       
      C   s   ddd}dd }dd }|j  | j  d }|dkr:dS | j}|d k	r`d	| |d |d
 f nd}| j|jkr||| j d d}	|||| j d |	7 }n,|| |j  d d}	||| |j  d |	7 }|| j |fS )Nc       
      S   sL  d}| j }| j | jk rdnd}d}| js,|S xt| jD ]}|jdrx|rV||ksb|rj||krj||7 }|d| 7 }q:|jdrt|dd  }nt|d }|}	|r|||  |ks||kr|| |d	kr|nd }	n8|o||k p|||  |k r||dkr|nd | }	|	dk r\|jds4||| 7 }|jdrH||8 }|jdr:||7 }q:|jdr||| 7 }|||	 7 }|r|d|d|	|  d   7 }n|r>||d |	d  7 }q:|jdr|||	 8 }|r|d|d|	|  d   7 }n|r>||d |	d  7 }q:|jdr:||| 7 }|dt|	 7 }q:W || _|S )
Nzcs:Z:r   r   *:-+r   )r7   r8   r>   r   r   rB   r1   rM   )
r   	new_startnew_endZ	new_cigarZctg_posstrand_directiondiff_lenopn_basesZcorr_n_basesr)   r)   r*   __shift_cigarR  s\    




z0exclude_internal_overlaps.<locals>.__shift_cigarc             S   s   d| j   }| j| jk rJ|  j|| j | 7  _|| _| j| j d | _n0|  j|| j | 8  _|| _| j| j d | _| j| j d | _|d| j   7 }|S )Nz%sr   z --> %s
)rS   r7   r8   r5   r:   r6   r9   )r   r   r   align_modificationr)   r)   r*   __shift_start  s    z0exclude_internal_overlaps.<locals>.__shift_startc             S   s   d| j   }| j| jk rJ|  j| j| | 8  _|| _| j| j d | _n0|  j| j| | 7  _|| _| j| j d | _| j| j d | _|d| j   7 }|S )Nz%sr   z --> %s
)rS   r7   r8   r6   r:   r5   r9   )r   r   r   r   r)   r)   r*   __shift_end  s    z.exclude_internal_overlaps.<locals>.__shift_endr   r   zF			Excluding internal overlap of size %d between Alignment %d and %d: r   rT   )r   )r   )NN)r   N)r\   r^   r:   )
rj   rk   rD   r   r   r   rz   Z	prev_len2overlap_msgr   r)   r)   r*   exclude_internal_overlapsP  s    
4$r   c             C   s2   | |j  |j d  }|jd}|t|| fS )Nr   N)r^   r\   countr1   )r   rj   rk   gap_in_contigZns_countr)   r)   r*   "count_ns_and_not_ns_between_aligns  s    
r   c             C   s&   | |j  |j d  }dtj |kS )Nr   r   )r^   r\   r   Ns_break_threshold)r   rj   rk   r   r)   r)   r*   r     s    r   c       1   	   C   s	  d}| d }|j }d}d}t }d}g }g }x*tt| d D ]}| |d  }t|||}t|||\}}t||||||
|\}}d}|r4t|jt|j }}|j|jkrt	j
r||krd}qd}nt|d t	jkrd}nd	}|j|jkr|j|j }} n|j|j }} |j||||j f|| |fg n
|jg  |j|||||f |}q@W d }!t	jrnt|}!| d }g }"xtt| d D ]}| |d  }|| \}}}}}|r||8 }|jj| |d }#|d
 }$||d 7 }|d }%|jj|j d  |jjd|d t|f  |j|jg j| |jj|j d  t|jt|j }}|d r|jjd |jjd |jtj n|d r@t|#t	jkrd}&|jtj || jtj |jjd|& d  n4d}&|jtj || jtj |jjd|& d  |jjd |jjdt|# |& d  n|r~|!r~|!| r~|jjd |jjd |jtj n|rJd}|d7 }|"j| d}|s|jj|j d  d}|jjd |jjd d}'|dkrtj!}(|	| |  d7  < |	| |  d7  < nD|dkr"tj"}(n2|dkrNtj#}(dt|# |%rFdnd }'ntj$}(|j|( || j|( |(tj!kr|| j|( t%|||r|d7 }|j|(tj&tj$   |jj||'  |jj||'  |jj||'  |jjd  |jjd!|j'|j(|j'|j(f d  |jjd d"|j|ji |j< d"|j|ji |j< nd|%rVdnd |j|jkrld#nd })|#dkr|%r|jjd$|) d%  |jjd&|) d  nb|#dkr|j|jkr|jjd$|) d%  |jtj) || jtj) |jjd&|) d  nt|#t	j*k rZt+|||d t,t	j-t	j*d krZt+|||\}*}+|#dkr|jjd'|+|*f |) d  | j.|+7  _.|jjd(|) d  nt|#},|,t	j/krd)nd*}-|#dk rd+nd,}.t,d|+|, }/|jjd-|-|.|,|/|*f |) d  |j0j|, |.d+kr | j1|,7  _1n| j2|,7  _2| j.|/7  _.|jjd.|-j3  |) d  nt	j4rp|"j| d}|$dk r|jjd/ n<|$dkr|jjd0 n$|#dk r|jjd/ n|jjd0 |jjd1t|# |) d  |jjd2|) d  |jtj5 || jtj5 |}||j |$dk r*|$ nd 7 }qW t| d }|jjd3|d t|f d  |jj|j d  |j|jg j| |jj|j d  |"j| t6|"}0|	rxt|"dk	rx| d7 j| d jk	rx|0d4|| d j  k	rxt| d8 | d |||d|
d5\}}| 	rx|d  	rx|d  	rxt|d }#t	j4 	sX|#t	j*k 	rx|"d  |"d9 7  < |"d d: }"|j7|" |0t|k	st8d6| d j |0t|f |||||0fS );Nr   Fr   rT   zinterspecies translocationZtranslocationry   Z
relocationZ	inversionrz   r{   rn   
z			Real Alignment %d: %s
r|   zY			  Not a misassembly (structural variation of the genome) between these two alignments
z=fake: not a misassembly (structural variation of the genome)
r}   z (extensive)z(fake: scaffold gap size wrong estimationz (local)z0			  Scaffold gap between these two alignments, z1gap lengths difference (reference vs assembly) = zT			  Not a misassembly (possible transposable element) between these two alignments
z8fake: not a misassembly (possible transposable element)
TzExtensive misassembly (z			  Extensive misassembly (z, inconsistency = z+ [linear representation of circular genome]z, scaffold gap is presentz) between these two alignments
z) between %s %s and %s %sMz$ [fragmentation of reference genome]z			  Not a misassemblyz between these two alignments
zfake: not a misassemblyzM			  Stretch of %d mismatches between these two alignments (number of Ns: %d)zindel: stretch of mismatcheszIndel (<= 5bp)zIndel (> 5bp)Z	insertionZdeletionzW			  %s between these two alignments: %s of length %d; %d mismatches (number of Ns: %d)zindel: z>			  Overlap between these two alignments (local misassembly).z:			  Gap between these two alignments (local misassembly).z Inconsistency = zlocal misassemblyz			Real Alignment %d: %sgffffff?)r   r   zkInternal QUAST bug: contig aligned length is greater than contig length (contig: %s, len: %d, aligned: %d)!r   r   r   r   )9r:   ra   ranger1   rx   r   r   r   r<   r   rv   ri   rh   r5   r6   r   large_genomer   Zstdout_fwriteZicarus_out_frY   rM   
setdefaultZcoords_filtered_frR   r
   r&   r"   r#   r'   Zmisassembly_fr=   r!   r    r   r   r   r(   r7   r8   r$   local_misassembly_min_lengthr   r]   min_alignmentrb   SHORT_INDEL_THRESHOLDre   rc   rd   lower	strict_NAr   r   extendrH   )1Zsorted_alignsr   aligned_lengthsZregion_misassembliesrq   Z
ref_alignsZref_featuresr   Zmisassemblies_by_refZistranslocations_by_refr   	ca_outputr{   
prev_alignZcur_aligned_lengthis_misassembledZcontig_is_printedZindels_infoZcnt_misassembliesr   Zmisassembly_inforD   Z
next_alignr   Zinternal_overlapr   Zis_extensive_misassemblyr   Zmisassembly_typeZprev_refZnext_refr   r   r   Zcontig_aligned_lengthsry   rz   rn   Zscaff_gap_typemsgZmisassembly_idZ
reason_msgZ	ns_numberZnot_ns_numberZindel_lengthZindel_classZ
indel_typerb   Zcontig_aligned_lengthr)   r)   r*   process_misassembled_contig  s^   
 


 





$$





$ 
B 
r   )N)FNFF)N)!
__future__r   r   
quast_libsr   quast_libs.ca_utils.miscr   r   r   quast_libs.logr   LOGGER_DEFAULT_NAMEloggerquast_libs.qutilsr	   r
   objectr+   r4   ra   ro   rp   rr   rx   r   r   r   r   r~   r   r   r   r   r)   r)   r)   r*   <module>   s.   
4

,6
c