a
    £žb‘   ã                   @   sÌ   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 e	ej
ƒZe dej¡Ze dej¡Ze dej¡Ze dej¡Zdd	„ Zd
d„ Zdd„ Zdd„ Zdd„ ZG dd„ deƒZG dd„ dƒZdS )é    N)ÚqutilsÚqconfig)Úopen_gzipsafe)Ú
get_loggerzC(?P<seqname>\S+)\s+(?P<gene_id>.+)\s+(?P<start>\d+)\s+(?P<end>\d+)$zƒ(?P<seqname>\S+)\s+\S+\s+(?P<feature>\S+)\s+(?P<start>\d+)\s+(?P<end>\d+)\s+\S+\s+(?P<strand>[\+\-\.]?)\s+\S+\s+(?P<attributes>.+)$z%(?P<number>\d+)\.\s*(?P<name>\S+)\s*$c                 C   s4  | rt j | ¡sg S t| dƒ}g }| ¡  ¡ }|dks@| d¡rN| ¡  ¡ }q.| d¡ |  d¡sl|  d¡rvt	|ƒ}n²t
 |¡sŠt |¡r”t|ƒ}n”t |¡rªt||ƒ}n~t |¡rzt|ƒ}W nB ty   t ¡ \}}}t d| ¡ t | d ¡ g }Y n0 n t d	| d
 ¡ t | d ¡ | ¡  |S )NÚrÚ ú#r   Úbedzbed.gzzParsing exception z was skippedzIncorrect format of zA's file! GFF, NCBI and the plain TXT format accepted. See manual.)ÚosÚpathÚexistsr   ÚreadlineÚrstripÚ
startswithÚseekÚendswithÚ	parse_bedÚtxt_pattern_giÚmatchÚtxt_patternÚ	parse_txtÚgff_patternÚ	parse_gffÚncbi_start_patternÚ
parse_ncbiÚParseExceptionÚsysÚexc_infoÚloggerÚwarningÚclose)ÚfpathÚfeatureZ
genes_fileÚgenesÚlineÚexc_typeÚ	exc_valueÚ_© r(   úA/home/psgendb/BIRCHDEV/pkg/quast-5.2.0/quast_libs/genes_parser.pyÚget_genes_from_file   s4    




r*   c                 C   s@  t  dt j¡}t  dt j¡}t  dt j¡}g }|  ¡ }|dkr<| ¡ dksV| d¡rj|dkr`qj|  ¡ }q@t | ¡ ¡}|sŒt | ¡ ¡}qxtt	| 
d¡ƒt | 
d¡¡d}g }|  ¡ }|dkrèt | ¡ ¡sè| | ¡ ¡ |  ¡ }qº|D ],}	|	 d	¡rt  ||	¡}|r| 
d
¡|_|	 d¡rÎt  ||	¡}|r¬| 
d¡|_t	| 
d¡ƒ|_t	| 
d¡ƒ|_dt|jƒ }
|jrÎ|j |
¡rÎ|jt|
ƒd … |_|j d¡ n"t dt|jƒ d |j d ¡ |	 d¡rìt  ||	¡}|rø| 
d¡|_qìt dt|jƒ d |j d ¡ qì|jd ur6|jd ur6| |¡ q6|S )NzMAnnotation: (?P<seqname>.+) \((?P<start>\d+)\.\.(?P<end>\d+)(, complement)?\)z Chromosome: (?P<chromosome>\S+);zID: (?P<id>\d+)r   z##ÚnumberÚname)r+   r,   zChromosome:Ú
chromosomezAnnotation:ÚseqnameÚstartÚendzChromosome z ,zWrong NCBI annotation for gene z. z. Skipping this gene.zID:Úidz.Can't parse gene's ID in NCBI format. Gene is z. Skipping it.)ÚreÚcompileÚIr   r   r   r   r   ÚGeneÚintÚgroupr   Úcorrect_nameÚappendr-   r.   r/   r0   ÚstrÚlenÚlstripr   r   r+   r,   r1   )Z	ncbi_fileZannotation_patternZchromosome_patternZ
id_patternr#   r$   ÚmÚgeneZthe_rest_linesZ	info_lineZto_trimr(   r(   r)   r   L   sZ    

ÿ

"
$r   c                 C   s    g }d}| D ]Ž}|  ¡ }t |¡p*t |¡}|rt|t | d¡¡d}|d7 }t| d¡ƒ}t| d¡ƒ}t	||ƒ|_
t||ƒ|_| d¡|_| |¡ q|S )Nr   r.   ©r+   r.   é   r/   r0   Zgene_id)r   r   r   r   r5   r   r8   r7   r6   Úminr/   Úmaxr0   r1   r9   )Úfiler#   r+   r$   r=   r>   ÚsÚer(   r(   r)   r      s"    ÿr   c                 C   s  g }d}| D  ]þ}t  | ¡ ¡}|r|tjksD| d¡ ¡ | ¡ krtt 	| d¡¡t
| d¡ƒt
| d¡ƒd}| d¡ d¡}|D ]l}|r†|d	kr†d
|v r†| d
¡d }	|t|	ƒd d … }
|	 ¡ dkrÒ|
|_|	 ¡ dkrä|
|_|
|j|	 ¡ < q†||_|d7 }| |¡ q|S )Nr   r"   r.   r/   r0   )r.   r/   r0   Ú
attributesú;r   ú=r@   r1   r,   )r   r   r   r   ÚALL_FEATURES_TYPEr7   Úlowerr5   r   r8   r6   Úsplitr;   r1   r,   rF   r+   r9   )rC   r"   r#   r+   r$   r=   r>   rF   ÚattrÚkeyÚvalr(   r(   r)   r   ¬   s.    
$þr   c           	      C   s²   g }d}| D ] }|  ¡  ¡ }|r|d }t|d ƒ}t|d ƒ}t|t |¡d}t||ƒ|_t||ƒ|_	t
|ƒdkr~|d nd |_||k r”d|_nd|_|d7 }| |¡ q|S )Nr   r@   é   r?   é   ú+ú-)r   rK   r6   r5   r   r8   rA   r/   rB   r0   r;   r1   Ústrandr9   )	rC   r#   r+   r$   Úfsr.   rD   rE   r>   r(   r(   r)   r   Ë   s$    r   c                       s$   e Zd Z‡ fdd„Zdd„ Z‡  ZS )r   c                    s    t t| ƒj|i |¤Ž || _d S ©N)Úsuperr   Ú__init__Úvalue)ÚselfrX   ÚargsÚkwargs©Ú	__class__r(   r)   rW   æ   s    zParseException.__init__c                 C   s
   t | jƒS rU   )ÚreprrX   )rY   r(   r(   r)   Ú__str__é   s    zParseException.__str__)Ú__name__Ú
__module__Ú__qualname__rW   r_   Ú__classcell__r(   r(   r\   r)   r   å   s   r   c                   @   s   e Zd Zddd„ZdS )r5   r   Nc                 C   sT   || _ || _|| _|| _|| _|| _|| _|| _|	| _|
| _	|| _
tƒ | _|| _d S rU   )r1   r.   r/   r0   r+   r,   r-   ÚcontigrS   ÚseqÚproteinÚdictrF   Úis_full)rY   r1   r.   r/   r0   r+   r,   r-   rd   rS   re   rf   rh   r(   r(   r)   rW   î   s    zGene.__init__)r   r   NNNr   NNNNNN)r`   ra   rb   rW   r(   r(   r(   r)   r5   í   s     ÿr5   )r
   r2   r   Ú
quast_libsr   r   Zquast_libs.ca_utils.miscr   Úquast_libs.logr   ÚLOGGER_DEFAULT_NAMEr   r3   r4   r   r   r   r   r*   r   r   r   r   Ú	Exceptionr   r5   r(   r(   r(   r)   Ú<module>   s"   
1C