zktuong/dandelion

Determine which receptor arms to define clones by

Closed this issue · 7 comments

Ngort commented

Is your feature request related to a problem?

I'm working on a plasma cell pathology where we expect lots of the clones to have orphan VJ chains, and we'd like to use BCRseq to trace expanded clones. Unfortunately, Dandelion misses these because they don't have a functional BCR.

Describe the solution you'd like

I'd like another parameter for define_clones, e.g. receptor_arms, which could take the values "VJ", "VDJ", or "both" (the latter being the default).

I'm still trying to work through the source of tl.define_clones, but I think all the necessary parts should already be there.
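Roughly, the requested interface might look like the call below. To be clear, receptor_arms is the proposed parameter, not something that exists today, and I'm showing it on tl.find_clones (dandelion's native clone caller, which the replies below point to) purely as a sketch:

import dandelion as ddl

# hypothetical usage of the requested option; values are the ones proposed above
vdjx = ddl.read_10x_airr("airr_rearrangement.tsv")
ddl.tl.find_clones(vdjx, receptor_arms="both")  # proposed default: current paired behaviour
ddl.tl.find_clones(vdjx, receptor_arms="VJ")    # proposed: cluster orphan VJ chains only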

Describe alternatives you've considered

I tried removing all VDJ chains from the Dandelion object, but that fails.
I tried clustering VJ chains from vdjx.data based on changeo_clone_id, but they don't get a clone_id assigned.

Additional context

No response

Hi @Ngort, happy to support.

Just want to clarify a few things:

I suppose you mean tl.find_clones, as define_clones just points to changeo's DefineClones.py?

At the moment there is a single check at

if dat_vdj.shape[0] > 0:

which basically requires that .data contains VDJ chains; otherwise the clustering is not performed.
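Purely as a sketch of the idea (dat_vj_c here stands in for the VJ subset that the chunk below builds; the real change would live inside tl.find_clones):

# sketch only, not the actual patch
if dat_vdj.shape[0] > 0:
    # existing behaviour: cluster on the VDJ chains, then refine by VJ pairing
    ...
elif dat_vj_c.shape[0] > 0:
    # proposed: run the same grouping/clustering directly on the VJ contigs,
    # emitting clone ids with a placeholder for the missing VDJ part
    ...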

I'm thinking I can add an if/else at the end of this entire chunk and "duplicate" this section (as a separate function, to reduce code bloat):

dat_vj_c = dat[dat["locus"].isin(locus_2)].copy()
if dat_vj_c.shape[0] != 0:
    if not by_alleles:
        if "v_call_genotyped" in dat_vj_c.columns:
            Vvj = [
                re.sub("[*][0-9][0-9]", "", str(v))
                for v in dat_vj_c["v_call_genotyped"]
            ]
        else:
            Vvj = [
                re.sub("[*][0-9][0-9]", "", str(v))
                for v in dat_vj_c["v_call"]
            ]
        Jvj = [
            re.sub("[*][0-9][0-9]", "", str(j))
            for j in dat_vj_c["j_call"]
        ]
    else:
        if "v_call_genotyped" in dat_vj_c.columns:
            Vvj = [str(v) for v in dat_vj_c["v_call_genotyped"]]
        else:
            Vvj = [str(v) for v in dat_vj_c["v_call"]]
        Jvj = [str(j) for j in dat_vj_c["j_call"]]
    # collapse the alleles to just genes
    Vvj = [",".join(list(set(v.split(",")))) for v in Vvj]
    Jvj = [",".join(list(set(j.split(",")))) for j in Jvj]
    seq = dict(zip(dat_vj_c.index, dat_vj_c[key_]))
    if recalculate_length:
        seq_length = [len(str(l)) for l in dat_vj_c[key_]]
    else:
        try:
            seq_length = [
                len(str(l)) for l in dat_vj_c[key_ + "_length"]
            ]
        except:
            raise ValueError(
                "{} not found in {} input table.".format(
                    key_ + "_length", locus_log2_dict[locusx]
                )
            )
    seq_length_dict = dict(zip(dat_vj_c.index, seq_length))
    # Create a dictionary and group sequence ids with same V and J genes
    V_Jvj = dict(zip(dat_vj_c.index, zip(Vvj, Jvj)))
    vj_lightgrp = defaultdict(list)
    for key, val in sorted(V_Jvj.items()):
        vj_lightgrp[val].append(key)
    # and now we split the groups based on lengths of the seqs
    vj_len_lightgrp = Tree()
    seq_lightgrp = Tree()
    for g in vj_lightgrp:
        # first, obtain what's the unique lengths
        jlen = []
        for contig_id in vj_lightgrp[g]:
            jlen.append(seq_length_dict[contig_id])
        setjlen = list(set(jlen))
        # then for each unique length, we add the contigs to a new tree if it matches the length
        # and also the actual seq sequences into a another one
        for s in setjlen:
            for contig_id in vj_lightgrp[g]:
                jlen_ = seq_length_dict[contig_id]
                if jlen_ == s:
                    vj_len_lightgrp[g][s][contig_id].value = 1
                    seq_lightgrp[g][s][seq[contig_id]].value = 1
                    for c in [contig_id]:
                        vj_len_lightgrp[g][s][c] = seq[c]
    clones_light = Tree()
    for g in seq_lightgrp:
        for l in seq_lightgrp[g]:
            seq_ = list(seq_lightgrp[g][l])
            tdarray = np.array(seq_).reshape(-1, 1)
            d_mat = squareform(
                pdist(tdarray, lambda x, y: hamming(x[0], y[0]))
            )
            # then calculate what the acceptable threshold is for each length of sequence
            tr = math.floor(int(l) * (1 - identity_))
            d_mat = np.tril(d_mat)
            np.fill_diagonal(d_mat, 0)
            # convert diagonal and upper triangle to zeroes
            indices_temp = []
            indices = []
            indices_temp = [
                list(x) for x in np.tril_indices_from(d_mat)
            ]
            # get the coordinates/indices of seqs to match against the threshold later
            indices = list(
                zip(indices_temp[0], indices_temp[1])
            )
            # if there's more than 1 contig, remove the diagonal
            if len(indices) > 1:
                for pairs in indices:
                    if pairs[0] == pairs[1]:
                        indices.remove(pairs)
            indices_j = []
            # use the coordinates/indices to retrieve the seq sequences
            for p in range(0, len(indices)):
                a1, b1 = indices[p]
                indices_j.append(seq_[a1])
                indices_j.append(seq_[b1])
            # retain only the unique sequences
            indices_j_f = list(set(indices_j))
            # convert the distance matrix to coordinate (source) and distance (target)
            # and create it as a dictionary
            source, target = d_mat.nonzero()
            source_target = list(
                zip(source.tolist(), target.tolist())
            )
            if len(source) == 0 & len(target) == 0:
                source_target = list([(0, 0)])
            dist = {}
            for st in source_target:
                dist.update({st: d_mat[st]})
            if d_mat.shape[0] > 1:
                seq_tmp_dict_l = clustering(dist, tr, seq_)
            else:
                seq_tmp_dict_l = {seq_[0]: tuple([seq_[0]])}
            # sort the list so that clones that are larger have a smaller number
            clones_tmp_l = sorted(
                list(set(seq_tmp_dict_l.values())),
                key=len,
                reverse=True,
            )
            for x in range(0, len(clones_tmp_l)):
                clones_light[g][l][x + 1] = clones_tmp_l[x]
    clone_dict_light = {}
    # now to retrieve the contig ids that are grouped together
    cid_light = Tree()
    for g in clones_light:
        for l in clones_light[g]:
            # retrieve the clone 'numbers'
            for c in clones_light[g][l]:
                grp_seq = clones_light[g][l][c]
                for key, value in vj_len_lightgrp[g][l].items():
                    if value in grp_seq:
                        cid_light[g][l][c][key].value = 1
    # rename clone ids - get dictionaries step by step
    first_key = []
    for k1 in cid_light.keys():
        first_key.append(k1)
    first_key = list(set(first_key))
    first_key_dict = dict(
        zip(first_key, range(1, len(first_key) + 1))
    )
    # and now for the middle key
    for g in cid_light:
        second_key = []
        for k2 in cid_light[g].keys():
            second_key.append(k2)
        second_key = list(set(second_key))
        second_key_dict = dict(
            zip(second_key, range(1, len(second_key) + 1))
        )
        for l in cid_light[g]:
            # and now for the last key
            third_key = []
            for k3 in cid_light[g][l].keys():
                third_key.append(k3)
            third_key = list(set(third_key))
            third_key_dict = dict(
                zip(third_key, range(1, len(third_key) + 1))
            )
            for key, value in dict(cid_light[g][l]).items():
                vL = []
                for v in value:
                    if type(v) is int:
                        break
                    # instead of converting to another tree, i will just make it a dictionary
                    clone_dict_light[v] = (
                        str(first_key_dict[g])
                        + "_"
                        + str(second_key_dict[l])
                        + "_"
                        + str(third_key_dict[key])
                    )
    lclones = list(clone_dict_light.values())
    renamed_clone_dict_light = {}
    if collapse_label:
        # will just update the main dat directly
        if len(list(set(lclones))) > 1:
            lclones_dict = dict(
                zip(
                    sorted(list(set(lclones))),
                    [
                        str(x)
                        for x in range(
                            1, len(list(set(lclones))) + 1
                        )
                    ],
                )
            )
        else:
            lclones_dict = dict(
                zip(sorted(list(set(lclones))), str(1))
            )
        for key, value in clone_dict_light.items():
            renamed_clone_dict_light[key] = lclones_dict[value]
    else:
        for key, value in clone_dict_light.items():
            renamed_clone_dict_light[key] = value
    cellclonetree = Tree()
    seqcellclonetree = Tree()
    for c, s, z in zip(
        dat["cell_id"], dat["sequence_id"], dat[clone_key]
    ):
        seqcellclonetree[c][s].value = 1
        if pd.notnull(z):
            cellclonetree[c][z].value = 1
    for c in cellclonetree:
        cellclonetree[c] = list(cellclonetree[c])
    fintree = Tree()
    for c in tqdm(
        cellclonetree,
        desc="Refining clone assignment based on VJ chain pairing ",
        bar_format="{l_bar}{bar:10}{r_bar}{bar:-10b}",
        disable=not verbose,
    ):
        suffix = [
            renamed_clone_dict_light[x]
            for x in seqcellclonetree[c]
            if x in renamed_clone_dict_light
        ]
        fintree[c] = []
        if len(suffix) > 1:
            for cl in cellclonetree[c]:
                if present(cl):
                    for s in suffix:
                        fintree[c].append(cl + "_" + s)
            # fintree[c] = fintree[c]
        else:
            for cl in cellclonetree[c]:
                if present(cl):
                    if len(suffix) > 0:
                        fintree[c].append(
                            cl + "_" + "".join(suffix)
                        )
                    else:
                        fintree[c].append(cl)
        fintree[c] = "|".join(fintree[c])
    dat[clone_key] = [fintree[x] for x in dat["cell_id"]]
    dat_[clone_key].update(pd.Series(dat[clone_key]))

So then I envisage that in this mode the VJ-only clone_id would just look like NoVDJ_1_2_3, and vice versa in the VDJ-only mode it would be 1_2_3_NoVJ, or something of the like.
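To make the label composition concrete (made-up numbers; the fields come from the V/J-gene group, junction-length group and cluster-index dictionaries in the code above):

# toy illustration of how the combined clone_id is assembled; values are made up
vdj_part = "1_2_3"    # VDJ-derived clone id
vj_suffix = "1_2_1"   # VJ-derived suffix, i.e. a clone_dict_light-style value
print(vdj_part + "_" + vj_suffix)  # -> "1_2_3_1_2_1"
# in the proposed VJ-only mode the VDJ part would become a placeholder, e.g. "NoVDJ_1_2_1"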

Does this look like something you are after?

Ngort commented

I think that would fit!

OK! Please bear with me as I try to implement it. Do you have a deadline you need this by?

Ngort commented

I'm trying to submit an abstract by Sunday, but obviously I don't mean to rush your generous work! In the meantime, do you think there's a faster makeshift way for me to access the light chain clusters presumably generated during the tl.find_clones workflow?

Ngort commented

Sorry, I stand corrected. I did mean to point to define_clones, the wrapper of DefineClones.py, although the same statement can be made about find_clones. My understanding is that DefineClones.py simply aligns sequences of any locus, regardless of whether they're paired to each other in a barcoded cell, and define_clones creates the cell clusters (clone_id). Thus, I'd assume there's a way to use changeo's wrapper with just VJ data. Is this accurate?

Based on the documentation:

Clone definition is based on the following criteria:

1. Identical V- and J-gene usage in the VDJ chain (IGH/TRB/TRD).

2. Identical junctional/CDR3 sequence length in the VDJ chain.

3. VDJ chain junctional/CDR3 sequences attain a minimum percentage of sequence similarity, based on Hamming distance. The similarity cut-off is tunable (default is 85%; change to 100% if analyzing TCR data). See the worked example after this list.

4. VJ chain (IGK/IGL/TRA/TRG) usage. If cells within a clone use different VJ chains, the clone will be split following the same conditions for VDJ chains in (1-3) above.
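As a worked example for criterion 3, mirroring the tr = math.floor(int(l) * (1 - identity_)) line in the code above (numbers are made up):

import math

junction_length = 45   # nucleotides, made-up example
identity = 0.85        # default similarity cut-off
threshold = math.floor(junction_length * (1 - identity))
print(threshold)       # 6 -> contigs differing at up to roughly 6 positions can fall into the same clone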

I guess I could try giving all the barcodes the same VDJ data and see what happens?
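Something like the untested sketch below is what I had in mind (the gene calls and junction are placeholders, and real contigs carry many more AIRR columns that would also need sensible values):

import dandelion as ddl
import pandas as pd

# untested sketch of the "same VDJ data for every barcode" idea:
# give every cell one identical dummy IGH contig so the VDJ-level grouping is trivial
# and the VJ-based refinement ends up doing the actual splitting
dummy = pd.DataFrame({"cell_id": vdjx.data["cell_id"].unique()})
dummy["sequence_id"] = dummy["cell_id"] + "_dummy_IGH"
dummy["locus"] = "IGH"
dummy["v_call"] = "IGHV1-2*01"      # placeholder gene calls
dummy["j_call"] = "IGHJ4*01"
dummy["junction_aa"] = "CARAAAAW"   # placeholder junction
dummy["productive"] = "T"

padded = ddl.Dandelion(pd.concat([vdjx.data, dummy], ignore_index=True))
ddl.tl.find_clones(padded)          # VJ differences should now drive the clone splits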

Ah, I'm not sure… probably? If you switch back to the previous release, the code for define_clones is still there and you could try running it.

Sorry, as this code is implemented by the changeo people, I can only offer limited assistance (I don't really know what it does).

dandelion's way of defining clonotypes for orphan VJ/VDJ has been implemented. See #329.