a
    b                     @   s
  d dl mZ d dl mZ d dlmZ d dlmZmZmZ d dl	m
Z
 e
ejZd dlmZ G dd dZG d	d
 d
eZG dd deZG dd deZd+ddZdd Zdd Zdd Zd,ddZdd Zdd Zdd  Zd!d" Zd-d#d$Zd%d& Zd'd( Zd)d* Z dS ).    )with_statement)division)qconfig)is_same_referenceget_ref_by_chromosomeparse_cs_tag)
get_logger)correct_namec                   @   sL   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdS )Misassemblyr                           	   
                  N)__name__
__module____qualname__LOCAL	INVERSION
RELOCATIONTRANSLOCATIONINTERSPECTRANSLOCATIONSCAFFOLD_GAPLOCAL_SCAFFOLD_GAP
FRAGMENTEDZPOTENTIALLY_MIS_CONTIGSPOSSIBLE_MISASSEMBLIES
MATCHED_SVPOTENTIAL_MGESCF_INVERSIONZSCF_RELOCATIONZSCF_TRANSLOCATIONZSCF_INTERSPECTRANSLOCATION r)   r)   S/home/psgendb/BIRCHDEV/pkg/quast-5.2.0/quast_libs/ca_utils/analyze_misassemblies.pyr
      s    r
   c                   @   s    e Zd ZdZdd Zdd ZdS )StructuralVariations
inversionsrelocationstranslocationsc                 C   s   g | _ g | _g | _d S Nr,   selfr)   r)   r*   __init__*   s    zStructuralVariations.__init__c                 C   s   t | jt | j t | j S r0   )lenr-   r.   r/   r1   r)   r)   r*   	get_count/   s    zStructuralVariations.get_countN)r   r   r   	__slots__r3   r5   r)   r)   r)   r*   r+   '   s   r+   c                
   @   sh   e Zd ZdZdddZedd Zdd Zd	d
 Zdd Z	dddZ
dd Zdd Zdd Zdd ZdS )Mappings1e1s2e2len1len2idyrefcontigcigarns_possv_typeNc              
   C   sP   |||||||||	f	\	| _ | _| _| _| _| _| _| _| _|
| _	|| _
|| _d S r0   r8   )r2   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   r)   r)   r*   r3   6   s    :zMapping.__init__c                    s        d  d   krN d   krN d   krN d   krNdksXn J   d } d } fd	d
dD \}}}}}}	t d }
 d }t||||||	|
|||
S )Nr   r   r   r   r   |r   r   c                    s   g | ]}t  | qS r)   )int).0iliner)   r*   
<listcomp>D       z%Mapping.from_line.<locals>.<listcomp>)r   r   r   r   r   r   r   r   )splitfloatr7   )clsrJ   r@   rA   r9   r:   r;   r<   r=   r>   r?   rB   r)   rI   r*   	from_line<   s    PzMapping.from_linec                 C   s@   d dd | j| jd| j| jd| j| jd| jd| j| j	fD S )N c                 s   s   | ]}t |V  qd S r0   strrG   xr)   r)   r*   	<genexpr>J   rL   z"Mapping.__str__.<locals>.<genexpr>rE   )
joinr9   r:   r;   r<   r=   r>   r?   r@   rA   r1   r)   r)   r*   __str__I   s    (zMapping.__str__c                 C   sF   d dd | j| jd| j| jd| j| jd| jd| j| j	d| j
fD S )NrQ   c                 s   s   | ]}t |V  qd S r0   rR   rT   r)   r)   r*   rV   N   rL   z%Mapping.coords_str.<locals>.<genexpr>rE   )rW   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   r1   r)   r)   r*   
coords_strM   s    (zMapping.coords_strc                 C   s0   d dd | j| jd| j| jd| j| jfD S )NrQ   c                 s   s   | ]}t |V  qd S r0   rR   rT   r)   r)   r*   rV   R   rL   z$Mapping.short_str.<locals>.<genexpr>rE   )rW   r9   r:   r;   r<   r=   r>   r1   r)   r)   r*   	short_strQ   s    zMapping.short_str Truec                 C   s4   d dd | j| j| j| j| j| j| j||f	D S )N	c                 s   s   | ]}t |V  qd S r0   rR   rT   r)   r)   r*   rV   U   rL   z,Mapping.icarus_report_str.<locals>.<genexpr>)rW   r9   r:   r;   r<   r@   rA   r?   )r2   	ambiguityis_bestr)   r)   r*   icarus_report_strT   s    zMapping.icarus_report_strc                 C   s.   t | j| j| j| j| j| j| j| j| j	| j

S r0   )r7   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   r1   r)   r)   r*   cloneW   s    zMapping.clonec                 C   s   t | j| jS )z&Return start on contig (always <= end))minr;   r<   r1   r)   r)   r*   startZ   s    zMapping.startc                 C   s   t | j| jS )z&Return end on contig (always >= start))maxr;   r<   r1   r)   r)   r*   end^   s    zMapping.endc                 C   s   | j | jk S )z7Returns True for positive strand and False for negative)r;   r<   r1   r)   r)   r*   
pos_strandb   s    zMapping.pos_strand)
NNNNNNNNNN)r[   r\   )r   r   r   r6   r3   classmethodrP   rX   rY   rZ   r`   ra   rc   re   rf   r)   r)   r)   r*   r7   3   s   


r7   c                   @   s    e Zd ZdZdd Zdd ZdS )
IndelsInfo
mismatches
insertions	deletionsindels_listc                 C   s   d| _ d| _d| _g | _d S )Nr   ri   r1   r)   r)   r*   r3   j   s    zIndelsInfo.__init__c                 C   sD   |  j |j 7  _ |  j|j7  _|  j|j7  _|  j|j7  _| S r0   ri   )r2   otherr)   r)   r*   __add__p   s
    zIndelsInfo.__add__N)r   r   r   r6   r3   ro   r)   r)   r)   r*   rh   g   s   rh   Nc                 C   s   |j | j d }| j |j d }|  r6| r6|}n,|  sL| sL|}n|j | j kr^|}n|}d}|d ur|}| j|jk r|| tjk r|| }n"| j|jkr|| tjk r|| }t|t|k r|}d}||fS )Nr   FT)r9   r:   rf   r   extensive_misassembly_thresholdabs)align1align2Zcyclic_ref_lendistance_align1_align2distance_align2_align1distancecyclic_momentZcyclic_distancer)   r)   r*   distance_between_alignmentsx   s(    
rx   c                 C   sJ   |j | j d }| j |j d }|j | j kr:td| }ntd| }|S )Nr   r   )r9   r:   rd   )rr   rs   rt   ru   overlapr)   r)   r*   cyclic_back_ends_overlap   s    rz   c                 C   sL   |   r|| j | j n| jd }|  r4|jd n||j |j }||gS )Nr   )rf   r@   r:   r9   )rr   rs   ref_lensZgap1Zgap2r)   r)   r*   __get_border_gaps   s    ""r|   c                 C   sP   | j |j krdS tjrLtjr.t| j |j s.dS tdd t| ||D rLdS dS )NFc                 S   s   g | ]}|t jkqS r)   )r   fragmented_max_indent)rG   dr)   r)   r*   rK      rL   z8is_fragmented_ref_fake_translocation.<locals>.<listcomp>T)r@   r   check_for_fragmented_refis_combined_refr   allr|   )rr   rs   r{   r)   r)   r*   $is_fragmented_ref_fake_translocation   s    r   Fc                 C   s  |  |   d }|r$|t|7 }|r,|nd }	|	d ur\| j|jkr\t| ||	| j \}
}nt| |\}
}d}|dk r|
dkr| }n|
 | k r|
| }| j| jk}|j|jk}|
| }||||ddd}|rt||| |rd|d< d|fS |rt| |||rd|d< d|fS |r6t	t
| |||d< d|fS |r`t| |t|kr`t| | |d< | j|jkst|d tjks||krd|fS d|fS )	Nr   r   F)inconsistencydistance_on_contigmisassembly_internal_overlaprw   is_svis_scaffold_gapTr   r   r   )rc   re   r4   r@   rx   r;   r<   check_is_scaffold_gapcheck_svsumr|   rz   rq   r   rp   )rr   rs   
contig_seqr{   	is_cyclicregion_struct_variationsis_fake_translocationis_cyclic_contigr   cyclic_ref_lensdistance_on_referencerw   r   Zstrand1Zstrand2r   aux_datar)   r)   r*   is_misassembly   sH    "r   c                    s  ddt jd }fdd  fdd} fdd	}| j|jkrr|jD ]"}|| ||sf||| |rJ d
S qJnn| j| jk |j|jk krt|t jk r|jD ].}| j|d jkr|| |s|||r d
S qn|j}|j	| j	k r|j	| j
 }	}
n| j
|j	 }	}
t|D ]\}}|d j| jkr
 |	|d r
 |
|d rJ d
S |d jdkr
|d j
}|d }|t|k r
|| d j	| |kr
|| d j| jkr
|| } |
|d r d
S |d j
}|d7 }qlq
dS )Nd      r   c                    s6   |j dkrn }|j| |   ko0|j| kS   S )NQuastDEL)rD   r9   r:   )possvZ	max_error)max_error_svmax_error_trivial_delr)   r*   
__match_ci   s    zcheck_sv.<locals>.__match_cic                    sH   |d j | j krD|d j |j krD | j|d rD |j|d rDdS d S Nr   r   T)r@   r:   r9   )rr   rs   r   r   r)   r*   __check_translocation   s     z'check_sv.<locals>.__check_translocationc                    sx    | j |d r:|d j | j  kr2|d jkr:n ndS  | j|d rt|d j | j   krl|d jkrtn ndS d S r   )r9   r:   )alignr   r   r)   r*   __check_inversion   s    66z#check_sv.<locals>.__check_inversionTr   r   r   F)r   rp   r@   r/   r;   r<   rq   r-   r.   r9   r:   	enumeraterD   r4   )rr   rs   r   r   max_gapr   r   r   Z
variationsZsv_startZsv_endindexZprev_endZindex_variationr)   )r   r   r   r*   r      sH    

&
$"

r   c              
   C   sD  | sd S t  }t| }|D  ]}|d}|dsztt|d t|d t|d |d d}tt|d t|d	 t|d
 |d d}|j|jkr|j	||f nVd|d v r|j
	||f n8d|d v sd|d v sd|d v r|j	||f n W q ty   Y q0 qW d    n1 s60    Y  |S )Nr]   #r   r   r   r   )r9   r:   r@   rD   r   r   r   ZINVZDELZINSZBND)r+   openrM   
startswithr7   rF   r	   r@   r/   appendr-   r.   
ValueError)	bed_fpathr   frJ   fsrr   rs   r)   r)   r*   find_all_sv  s(    


,,&(r   c                 C   s   g }t |t | k rt |}| | rB|d t | ksB| |d  sN|d q| | d \}}}}| |d  d \}}}	||	kr|tjkr|j|jkr|j|jk |j|jk kr| |  }
}||
 d }t	||\}}|| }t
|tjkr|d |d q|d q|S )Nr   Fr   T)r4   r   r   rp   r@   r;   r<   re   rc   rx   rq   )misassembliesis_potential_mgeidxrr   start_in_refZms_typeZmge_lenrs   
end_in_refZms_type2Zstart_in_contigZend_in_contigr   r   rw   r   r)   r)   r*   detect_potential_mge0  s*    $
$

r   c                 C   sR   t | tjkrN|j|jkrN| | krN| |j|jk krNt|||rNdS dS )NTF)rq   r   scaffolds_gap_thresholdr@   rf   r9   is_gap_filled_ns)r   r   rr   rs   r)   r)   r*   r   H  s    
r   c           
      C   s   ddd}dd }dd }|  |   d }|dkr:d	S | j}|d ur`d
| |d |d f nd}| j|jkr|||  d d}	||||  d |	7 }n,|| |  d d}	||| |  d |	7 }|| j |fS )Nc           
      S   sH  d}| j }| j | jk rdnd}d}| js,|S t| jD ]}|drt|rR||ks^|rf||krf||7 }|d| 7 }q6|drt|dd  }nt|d }|}	|r|||  |ks||kr|| |dkr|nd }	n:|r||k s|||  |k r||dkr|nd | }	|	dk rZ|ds2||| 7 }|drF||8 }|dr6||7 }q6|dr||| 7 }|||	 7 }|r|d|d|	|  d   7 }n|r<||d |	d  7 }q6|dr|||	 8 }|r|d|d|	|  d   7 }n|r<||d |	d  7 }q6|dr6||| 7 }|dt|	 7 }q6|| _|S )	Nzcs:Z:r   r   *:-+)r;   r<   rB   r   r   rF   r4   rS   )
r   	new_startnew_endZ	new_cigarZctg_posstrand_directiondiff_lenopn_basesZcorr_n_basesr)   r)   r*   __shift_cigarR  sb    

 


z0exclude_internal_overlaps.<locals>.__shift_cigarc                 S   s   d|    }| j| jk rJ|  j|| j | 7  _|| _| j| j d | _n0|  j|| j | 8  _|| _| j| j d | _| j| j d | _|d|    7 }|S Nz%sr   z --> %s
)rZ   r;   r<   r9   r>   r:   r=   )r   r   r   align_modificationr)   r)   r*   __shift_start  s    z0exclude_internal_overlaps.<locals>.__shift_startc                 S   s   d|    }| j| jk rJ|  j| j| | 8  _|| _| j| j d | _n0|  j| j| | 7  _|| _| j| j d | _| j| j d | _|d|    7 }|S r   )rZ   r;   r<   r:   r>   r9   r=   )r   r   r   r   r)   r)   r*   __shift_end  s    z.exclude_internal_overlaps.<locals>.__shift_endr   r   )r   NzF			Excluding internal overlap of size %d between Alignment %d and %d: r   r[   )r   )r   )NN)rc   re   r>   )
rr   rs   rH   r   r   r   r   Z	prev_len2overlap_msgr   r)   r)   r*   exclude_internal_overlapsP  s&    
4r   c                 C   s2   | |  | d  }|d}|t|| fS Nr   N)re   rc   countr4   )r   rr   rs   gap_in_contigZns_countr)   r)   r*   "count_ns_and_not_ns_between_aligns  s    
r   c                 C   s&   | |  | d  }dtj |v S r   )re   rc   r   Ns_break_threshold)r   rr   rs   r   r)   r)   r*   r     s    r   c           1   	   C   s	  d}| d }|j }d}d}t }d}g }g }tt| d D ]}| |d  }t|||}t|||\}}t||||||
|\}}d}|r0t|jt|j }}|j|jkrt	j
r||krd}qd}nt|d t	jkrd}nd	}|j|jkr|j|j }} n|j|j }} |||||j f|| |fg n
|g  ||||||f |}q<d }!t	jrht|}!| d }g }"tt| d D ]}| |d  }|| \}}}}}|r||8 }|j| |d }#|d
 }$||d 7 }|d }%|j| d  |jd|d t|f  ||jg | |j| d  t|jt|j }}|d r|jd |jd |tj n|d r6t|#t	jkrd}&|tj || tj |jd|& d  n4d}&|tj || tj |jd|& d  |jd |jdt|# |& d  n|rt|!rt|!| rt|jd |jd |tj n|r@d}|d7 }|"| d}|s|j|j d  d}|jd |jd d}'|dkrtj!}(|	| |  d7  < |	| |  d7  < nD|dkrtj"}(n2|dkrDtj#}(dt|# |%r<dnd }'ntj$}(||( || |( |(tj!kr||| |( t%|||r|d7 }||(tj&tj$   |j||'  |j||'  |j||'  |jd  |jd!|j'|j(|j'|j(f d  |jd d"||ji |j< d"||ji |j< nd|%rLdnd |j|jkrbd#nd })|#dkr|%r|jd$|) d%  |jd&|) d  nb|#dkr|j|jkr|jd$|) d%  |tj) || tj) |jd&|) d  nt|#t	j*k rPt+|||d t,t	j-t	j*d krPt+|||\}*}+|#dkr|jd'|+|*f |) d  | j.|+7  _.|jd(|) d  nt|#},|,t	j/krd)nd*}-|#dk rd+nd,}.t,d|+|, }/|jd-|-|.|,|/|*f |) d  |j0|, |.d+kr| j1|,7  _1n| j2|,7  _2| j.|/7  _.|jd.|-3  |) d  nt	j4rf|"| d}|$dk r~|jd/ n<|$dkr|jd0 n$|#dk r|jd/ n|jd0 |jd1t|# |) d  |jd2|) d  |tj5 || tj5 |}||j |$dk r |$ nd 7 }qt| d }|jd3|d t|f d  |j| d  ||jg | |j| d  |"| t6|"}0|	rdt|"dk	rd| d4 j| d jk	rd|0d5|| d j  k	rdt| d4 | d |||d|
d6\}}|	sd|d 	sd|d 	sdt|d }#t	j4	rD|#t	j*k 	rd|"d  |"d4 7  < |"d d4 }"|7|" |0t|k	sJ d7| d j |0t|f |||||0fS )8Nr   Fr   r[   zinterspecies translocationZtranslocationr   Z
relocationZ	inversionr   r   rw   
z			Real Alignment %d: %s
r   zY			  Not a misassembly (structural variation of the genome) between these two alignments
z=fake: not a misassembly (structural variation of the genome)
r   z (extensive)z(fake: scaffold gap size wrong estimationz (local)z0			  Scaffold gap between these two alignments, z1gap lengths difference (reference vs assembly) = zT			  Not a misassembly (possible transposable element) between these two alignments
z8fake: not a misassembly (possible transposable element)
TzExtensive misassembly (z			  Extensive misassembly (z, inconsistency = z+ [linear representation of circular genome]z, scaffold gap is presentz) between these two alignments
z) between %s %s and %s %sMz$ [fragmentation of reference genome]z			  Not a misassemblyz between these two alignments
zfake: not a misassemblyzM			  Stretch of %d mismatches between these two alignments (number of Ns: %d)zindel: stretch of mismatcheszIndel (<= 5bp)zIndel (> 5bp)Z	insertionZdeletionzW			  %s between these two alignments: %s of length %d; %d mismatches (number of Ns: %d)zindel: z>			  Overlap between these two alignments (local misassembly).z:			  Gap between these two alignments (local misassembly).z Inconsistency = zlocal misassemblyz			Real Alignment %d: %sr   gffffff?)r   r   zkInternal QUAST bug: contig aligned length is greater than contig length (contig: %s, len: %d, aligned: %d)!)8r>   rh   ranger4   r   r   r   r   r@   r   r   rq   rp   r9   r:   r   large_genomer   Zstdout_fwriteZicarus_out_fr`   rS   
setdefaultZcoords_filtered_frY   r
   r&   r"   r#   r'   Zmisassembly_frA   r!   r    r   r   r   r(   r;   r<   r$   local_misassembly_min_lengthr   rd   min_alignmentrj   SHORT_INDEL_THRESHOLDrm   rk   rl   lower	strict_NAr   r   extend)1Zsorted_alignsr   aligned_lengthsZregion_misassembliesr{   Z
ref_alignsZref_featuresr   Zmisassemblies_by_refZistranslocations_by_refr   	ca_outputr   
prev_alignZcur_aligned_lengthis_misassembledZcontig_is_printedZindels_infoZcnt_misassembliesr   Zmisassembly_inforH   Z
next_alignr   internal_overlapr   Zis_extensive_misassemblyr   Zmisassembly_typeZprev_refZnext_refr   r   r   Zcontig_aligned_lengthsr   r   rw   Zscaff_gap_typemsgZmisassembly_idZ
reason_msgZ	ns_numberZnot_ns_numberZindel_lengthZindel_classZ
indel_typerj   Zcontig_aligned_lengthr)   r)   r*   process_misassembled_contig  s   
 


 





$ 





" 
B

r   )N)FNFF)N)!
__future__r   r   
quast_libsr   quast_libs.ca_utils.miscr   r   r   quast_libs.logr   LOGGER_DEFAULT_NAMEloggerquast_libs.qutilsr	   r
   objectr+   r7   rh   rx   rz   r|   r   r   r   r   r   r   r   r   r   r   r)   r)   r)   r*   <module>   s.   
4

,6
c