ó
Y_Sc           @@ s¹   d  Z  d d l m Z m Z d d l Z d d l Td d l Td d l Td Z d Z	 d Z
 d Z e j d ƒ Z e j d ƒ Z e j d ƒ Z d d „ Z d d „ Z d „  Z d „  Z d S(   sò   Read and write the CLUSTAL sequence file format.
 
See :
- http://www.cmpharm.ucsf.edu/~goh/Treecorr/sampleAlignment.html
- http://www.bioperl.org/wiki/ClustalW_multiple_alignment_format
  
Ref :
-   Higgins D., Thompson J., Gibson T., Thompson J.D., Higgins D.G., Gibson 
    T.J. (1994). CLUSTAL W: improving the sensitivity of progressive multiple
    sequence alignment through sequence weighting, position-specific gap
    penalties and weight matrix choice. Nucleic Acids Res. 22:4673-4680.
i    (   t   absolute_importt   print_functionNi   (   t   *i   t   examplet   namest
   extensionst   readsÓ  
CLUSTAL W (1.81) multiple sequence alignment


CXCR3_MOUSE       --------------------------LENSTSPYDYGENESD-------FSDSPPCPQDF
BLR_HUMAN         --------------------------LENLEDLF-WELDRLD------NYNDTSLVENH-
CXCR1_HUMAN       --------------------------MSNITDPQMWDFDDLN-------FTGMPPADEDY
CXCR4_MURINE      -----------------------------------YTSDN---------YSGSGDYDSNK
                                                     :  :          :..     ..
 
CXCR3_MOUSE       -SL-------NFDRTFLPALYSLLFLLGLLGNGAVAAVLLSQRTALSSTDTFLLHLAVAD
BLR_HUMAN         --LC-PATMASFKAVFVPVAYSLIFLLGVIGNVLVLVILERHRQTRSSTETFLFHLAVAD
CXCR1_HUMAN       -SPC-MLETETLNKYVVIIAYALVFLLSLLGNSLVMLVILYSRVGRSVTDVYLLNLALAD
CXCR4_MURINE      -EPC-RDENVHFNRIFLPTIYFIIFLTGIVGNGLVILVMGYQKKLRSMTDKYRLHLSVAD
                             :.  .:   * ::** .::**  *  ::   :   * *: : ::*::**

CXCR3_MOUSE       VLLVLTLPLWAVDAA-VQWVFGPGLCKVAGALFNINFYAGAFLLACISFDRYLSIVHATQ
BLR_HUMAN         LLLVFILPFAVAEGS-VGWVLGTFLCKTVIALHKVNFYCSSLLLACIAVDRYLAIVHAVH
CXCR1_HUMAN       LLFALTLPIWAASKV-NGWIFGTFLCKVVSLLKEVNFYSGILLLACISVDRYLAIVHATR
CXCR4_MURINE      LLFVITLPFWAVDAM-ADWYFGKFLCKAVHIIYTVNLYSSVLILAFISLDRYLAIVHATN
                  :*:.: **: ...     * :*  ***..  :  :*:*.. ::** *:.****:****..
t   clustalt   clustalwt   alns   (CLUSTAL.*)$s   (\s*\S+\s+)(\S+)\s*(\d*)\s*$s   ([\s:\.\*]*)$c         C@ s   t  t |  | ƒ ƒ S(   s'   Iterate over the sequences in the file.(   t   iterR   (   t   fint   alphabet(    (    sC   /home/psgendb/BIRCHDEV/pkg/weblogo-3.4/corebio/seq_io/clustal_io.pyt   iterseq[   s    c   	      C@ s‡  t  | ƒ } g  } g  } d } d } xt |  ƒ D]} | j d k rO d } q1 | j d k r“ t | ƒ | k r9| j | j ƒ | j g  ƒ q9q1 | j d k r1 | j | j ƒ sÖ t d | j | | j f ƒ ‚ n  | | j | j ƒ | d k rt | j ƒ } n$ | t | j ƒ k r,t d ƒ ‚ n  | d 7} q1 q1 Wg  t	 | | ƒ D]* \ } } t
 d j | ƒ | d	 | ƒ^ qM} t | ƒ S(
   Ni    t   begin_blockt   seq_idt   seqs.   Character on line: %d not in alphabet: %s : %ss   Inconsistent line lengthsi   t    t   name(   t   Alphabett   _scant   typeoft   lent   appendt   datat
   alphabetict
   ValueErrort   linenot   zipt   Seqt   joint   SeqList(	   R   R   t   seq_idst   seqst   block_countt   data_lent   tokent   st   i(    (    sC   /home/psgendb/BIRCHDEV/pkg/weblogo-3.4/corebio/seq_io/clustal_io.pyR   a   s2    	@c   	      c@ s   t  d ƒ \ } } } t d ƒ Vd } | } xÈt |  ƒ D]º\ } } | | k r¡ | j ƒ  rc q9 n  t j | ƒ } | } | d k	 r¡ t d | j ƒ  ƒ Vq9 q¡ n  | | k rÓ | j ƒ  r¿ q9 n  t d ƒ V| } n  | | k rê| j ƒ  rt d ƒ V| } q9 n  t j | ƒ } | d k	 r8t d | | d !ƒ Vq9 n  t	 j | ƒ } | d k rlt
 d | | f ƒ ‚ n  t | j d	 ƒ ƒ } t d
 | j d	 ƒ j ƒ  ƒ Vt d | j d ƒ j ƒ  ƒ V| j d ƒ r9 t d | j d ƒ ƒ Vq9 q9 n  t ƒ  ‚ q9 W| | k rt d ƒ Vn  t d ƒ Vd S(   sŽ  Scan a clustal format MSA file and yield tokens.
        The basic file structure is
            begin_document
                header?     
               (begin_block
                   (seq_id seq seq_index?)+
                   match_line?
               end_block)*
            end_document     
    
        Usage:
        for token in scan(clustal_file):
            do_something(token)
    i   t   beginiÿÿÿÿt   headerR   t	   end_blockt
   match_lines   Parse error on line: %d (%s)i   R   R   i   t   seq_numt   endN(   t   ranget   Tokent	   enumeratet   isspacet   header_linet   matcht   Nonet   groupR*   t   seq_lineR   R   t   stript   RuntimeError(	   R   R(   t   bodyt   blockt   leader_widtht   statet   Lt   linet   m(    (    sC   /home/psgendb/BIRCHDEV/pkg/weblogo-3.4/corebio/seq_io/clustal_io.pyR   ˆ   sR     	 	c   
      C@ sä   d } d } d } t  | d d d |  ƒt d „  | Dƒ ƒ } xŸ t d | | ƒ D]‹ } xu | D]m } t | t | ƒ ƒ } t | | t | ƒ ƒ }	 t  | j j | ƒ d d	 d |  ƒt  | | |	 !d |  ƒq^ Wt  d |  ƒ qQ Wd
 S(   s0   Write 'seqs' to 'fout' as text in clustal formats,   CLUSTAL W (1.81) multiple sequence alignmenti   i<   R,   s   

t   filec         s@ s   |  ] } t  | ƒ Vq d  S(   N(   R   (   t   .0R%   (    (    sC   /home/psgendb/BIRCHDEV/pkg/weblogo-3.4/corebio/seq_io/clustal_io.pys	   <genexpr>Õ   s    i    R   N(   t   printt   maxR-   t   minR   R   t   ljust(
   t   foutR!   R(   t
   name_widtht	   seq_widthR<   R9   R%   t   startR,   (    (    sC   /home/psgendb/BIRCHDEV/pkg/weblogo-3.4/corebio/seq_io/clustal_io.pyt   writeÎ   s    "(   s   examples   namess
   extensionss   read(   R   R   (   R	   (   t   __doc__t
   __future__R    R   t   ret   utilsR   R   t   __all__R   R   R   t   compileR1   R5   R*   R3   R   R   R   RI   (    (    (    sC   /home/psgendb/BIRCHDEV/pkg/weblogo-3.4/corebio/seq_io/clustal_io.pyt   <module>%   s    


'	F