o
    h&H                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlZddlZddl	m
Z ddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ e  e e!Z"g dZ#ej$j%ej$j&ej$j'ej$j(ej$j)ej$j*ej$j+ej$j,ej$j-ej$j.ej$j/ej$j0edZ1e2dZ3e3dg Z4e4g d Z5dd Z6dd Z7de8de8de9de8de8f
ddZ:e!dkre; Z<e<j=de8ddd  e<j=d!d"d#d$ e<j=d%de8dd&d' e<j=d(e8d)d* e<j=d+e8d,d* e<> Z?e:e?j@e?jAe?jBe?jCe?jD dS dS )-zConvert ESM checkpoint.    N)Path)TemporaryDirectory)batch_encode_sequences)
esmfold_v1)	EsmConfigEsmFoldConfig)EsmForMaskedLMEsmForSequenceClassificationEsmIntermediateEsmLayer	EsmOutputEsmSelfAttentionEsmSelfOutput)EsmForProteinFolding)EsmTokenizer)logging))protein1\  MNGTEGPNFYVPFSNATGVVRSPFEYPQYYLAEPWQFSMLAAYMFLLIVLGFPINFLTLYVTVQHKKLRTPLNYILLNLAVADLFMVLGGFTSTLYTSLHGYFVFGPTGCNLEGFFATLGGEIALWSLVVLAIERYVVVCKPMSNFRFGENHAIMGVAFTWVMALACAAPPLAGWSRYIPEGLQCSCGIDYYTLKPEVNNESFVIYMFVVHFTIPMIIIFFCYGQLVFTVKEAAAQQQESATTQKAEKEVTRMVIIMVIAFLICWVPYASVAFYIFTHQGSNFGPIFMTIPAFFAKSAAIYNPVIYIMMNKQFRNCMLTTICCGKNPLGDDEASATVSKTETSQVAPA)protein2?MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLA)protein3zPMKTVRQERLKSI<mask>RILERSKEPVSGAQLAEELS<mask>SRQVIVQDIAYLRSLGYN<mask>VATPRGYVLAGG)protein4zNMKTVRQERLKSI<mask>RILERSKEPVSGAQLAEELS<mask>SRQVIVQDIAYLRSLGYN<mask>VATPRGYVLA)esm1b_t33_650M_UR50Sesm1v_t33_650M_UR90S_1esm1v_t33_650M_UR90S_2esm1v_t33_650M_UR90S_3esm1v_t33_650M_UR90S_4esm1v_t33_650M_UR90S_5esm2_t48_15B_UR50Desm2_t36_3B_UR50Desm2_t33_650M_UR50Desm2_t30_150M_UR50Desm2_t12_35M_UR50Desm2_t6_8M_UR50Dr   ARNDCQEGHILKMFPSTWYVX)z<pad>z<mask>z<cls>z<sep>z<eos>c                  C   s^   t  } dt}t| d }|| tt|d}W d    n1 s%w   Y  d|_|S )N
	vocab.txt
vocab_filer   )r   joinrestypes_with_extrasr   
write_textr   strpad_token_id)tempdirvocabr)   hf_tokenizer r2   Y/var/www/html/ai/venv/lib/python3.10/site-packages/transformers/models/esm/convert_esm.pyget_esmfold_tokenizerN   s   

r4   c                 C   s>   | |  }|jrtd|j |jrtd|j d S )NzMissing keys: zUnexpected keys: )load_state_dict
state_dictmissing_keys
ValueErrorunexpected_keys)original_module
our_modulestatusr2   r2   r3   transfer_and_check_weightsX   s   r=   modelpytorch_dump_folder_pathclassification_headpush_to_repo
auth_tokenc           0      C   s	  |  drt|   }nt|   \}}|  |  dr|jj}|jj}|jj}	d| }
|jj}d}d}d}t }|j	
 D ]\}}t||rP|dkrPt||| q=|j	j
 D ]\}}t|j|rl|dkrlt|j|| qW|j	jj
 D ]\}}t|jj|rt|jj|| qtn?t|dr|jj}|jj}|jj}	|jj}
|jj}|jrdnd}d	}d}d
}n|j}|j}|j}	d| }
|j}d}d}d}d
}|r|jj}t|j}|j}|j}|r|j}n|}tdDi d|jjd|d|d|d|	d|
ddddddddd|d|d|d|d|d|d|}|r)|jd jjjd  |_t d!| |  dr7t!}n|r=t"}nt#}||} |   |jj| jj$j%_|d	kr\|j&j| jj$j'_|jrp|jj| jj$j(_|jj)| jj$j(_)|j*j| jj+j*_|j*j)| jj+j*_)t,|j-D ]}| jj+j.| }|j| }|j/j0}|j1j2jj3j|j1j4jj3j  kr|j1j5jj3j  krt67|j8|j8fksJ  J |j1j4j|j9j_3|j1j4j)|j9j)_3|j1j2j|j:j_3|j1j2j)|j:j)_3|j1j5j|j;j_3|j1j5j)|j;j)_3t<|j1d"d
d
ur|j1j=j>|j?j>_3|j@j|j/jA_|j@j)|j/jA_)|jBj|jA_|jBj)|jA_)|j/jC}|jDjj|j1jjjks1J |j1jj|jD_|j1jj)|jD_)|jE}|jDjj|jFjjksOJ |jFj|jD_|jFj)|jD_)|jC}|jDjj|jGjjkskJ |jGj|jD_|jGj)|jD_)q|r|jHj3| jH_3|jIj3| jI_3tJ|jK| jK tJ|jL| jL tJ|j| j tJ|jM| jM tJ|jN| jN tJ|jO| jO tJ|jP| jP nZ|r|jjd jDj| jQjD_|jd jDj)| jQjD_)|jd jj| jQj_|jd jj)| jQj_)n-|jOjDj| jOjD_|jOjDj)| jOjD_)|jOj(j| jOj(_|jOj(j)| jOj(_)|jOj| jOjR_|jOj)| jO_)tJ|jS| jjS |r&tTd
d# }ntT}|r]tU } | d$d% |D d&ddd'}!tVd(d% |D \}"}#}$}$}$t6W|!d) |"ko[t6W|!d* |#k}%nL|X }&|&|\}'}(})tY  }*d+Z|j}+t[|*d, },|,\|+ t]t^|,d-} W d
   n	1 sw   Y  | d.d% |D d&dd/}!t6W|!d) |)k}%t d0|%rd1nd2 |%st_d3t6`  |r|a bd4d% |D }-| a |!d) a |!d* a d5}.n.| dDi |!d6di}.|.d7 }.|r|jcjd |d|)}-n||!d) tet,d8d9}-|-d7 }-|r-t6ft6g|.d: |-d:  h }/t6ji|.d: |-d: dd;}%nt6ft6g|.|- h }/t6ji|.|-dd;}%t d<|/  t d=|%rOd1nd2 |%sYt_d>|s| j|!d) |!d* }.|j|!d) }-t6ft6g|.|- h }/t6ji|.|-dd;}%t d? t d<|/  t d=|%rd1nd2 |%st_d>tk[|jlddd@ t dA|  | m| ~W d
   n	1 sw   Y  t dB|  | m| |r| jn||dC | jn||dC d
S d
S )Ez?
    Copy/paste/tweak esm's weights to our BERT structure.
    esmfold   FrotaryTtrunkstructure_moduleargsabsoluteN
vocab_sizemask_token_idhidden_sizenum_hidden_layersnum_attention_headsintermediate_sizemax_position_embeddingsi  layer_norm_epsgh㈵>attention_probs_dropout_probg        hidden_dropout_probr.   emb_layer_norm_beforetoken_dropoutposition_embedding_typeis_folding_modelesmfold_config
vocab_listmnlir   zOur ESM config:rot_emb   c                 S      g | ]}|d  qS    r2   .0rowr2   r2   r3   
<listcomp>,      z5convert_esm_checkpoint_to_pytorch.<locals>.<listcomp>pt)return_tensorspaddingadd_special_tokensc                 S   r]   r^   r2   r`   r2   r2   r3   rc   .  rd   	input_idsattention_maskr&   r'   r(   c                 S   r]   r^   r2   r`   r2   r2   r3   rc   =  rd   )rf   rg   z1Do both models tokenizers output the same tokens?u   🔥u   💩zTokenization does not match!c                 S   r]   r^   r2   r`   r2   r2   r3   rc   K  rd   )ri   rj   output_hidden_stateslogitsi  )repr_layers	positions)atolzmax_absolute_diff = z'Do both models output the same tensors?zSomething went wRoNgzContact prediction testing:)parentsexist_okzSaving model to zSaving tokenizer to )repo_idtoken_tokenr2   )o
startswithMODEL_MAPPINGevalesm	embed_dim
num_layersattention_headsrU   r   cfgitemshasattrsetattrrF   rG   rH   layersffn_embed_dimrT   alphabettupleall_toksmask_idxpadding_idxr   embed_tokensnum_embeddingsclassification_headsout_projweightshape
num_labelsprintr   r	   r   
embeddingsword_embeddingsembed_positionsposition_embeddings
layer_normbiasemb_layer_norm_afterencoderrangerM   layer	attentionself	self_attnk_projdataq_projv_projtorchSizerL   querykeyvaluegetattrr[   inv_freqrotary_embeddingsself_attn_layer_norm	LayerNormfinal_layer_normoutputdenseintermediatefc1fc2esm_s_combine
af2_to_esmr=   	embedding	esm_s_mlpdistogram_headptm_headlm_head	lddt_head
classifierdecodercontact_headSAMPLE_DATAr4   esmfold_encode_sequencesallget_batch_converterr   r*   r   r,   r   r-   	Exceptionno_gradcudainferr>   extract_featureslistmaxabsitemallclosepredict_contactspathlibmkdirsave_pretrainedpush_to_hub)0r>   r?   r@   rA   rB   rw   r   rx   ry   rN   rO   rU   rT   rV   rW   rX   r   valrY   rK   r.   original_esm_modelconfigmodel_classir   	esm_layerr   self_outputr   bert_outputsample_datar1   	hf_tokensesmfold_aasesmfold_mask_successbatch_converterbatch_labels
batch_strsbatch_tokensr/   r0   r)   their_output
our_outputmax_absolute_diffr2   r2   r3   !convert_esm_checkpoint_to_pytorch`   s  



	








 
5
r   __main__z--pytorch_dump_folder_pathTz!Path to the output PyTorch model.)typerequiredhelpz--classification_head
store_truez/Whether to convert a final classification head.)actionr   z--modelzName of model to convert.)defaultr   r   r   z--push_to_repoz(Repo to upload to (including username!).)r   r   z--auth_tokenzHuggingFace auth token.)E__doc__argparser   r   tempfiler   rw   
esm_moduler   esm.esmfold.v1.miscr   r   esm.esmfold.v1.pretrainedr   )transformers.models.esm.configuration_esmr   r   $transformers.models.esm.modeling_esmr   r	   r
   r   r   r   r   (transformers.models.esm.modeling_esmfoldr   (transformers.models.esm.tokenization_esmr   transformers.utilsr   set_verbosity_info
get_logger__name__loggerr   
pretrainedr   r   r   r   r   r   r   r   r    r!   r"   r#   ru   r   restypesrestypes_with_xr+   r4   r=   r-   boolr   ArgumentParserparseradd_argument
parse_argsrH   r>   r?   r@   rA   rB   r2   r2   r2   r3   <module>   s   $	



  #