Core Requirements: 1. Use concepts of 1–200 tokens each, with an average around 17 (flexible depending on domain) 2. Use 768-D input and output so we can take advantage of GTR-T5/vec2text 3. Use GTR-T5 for sentence → vector encoding in 768-D, and vec2text (jxe or ielab) for vector → sentence decoding in 768-D 4. Use a dynamic cloud Text → Vector → Text database that is external to the model 5. Prefer a trainable prototype on a MacBook Pro M4 (40-core GPU, 128 GB unified memory) using MPS 6. Test using triplets — Original Question; Pipeline 1: Text → GTR-T5 → 768D → vec2text → Text; next-sentence generative Pipeline 2: Text → GTR-T5 → 768D → Mamba → 768D → vec2text → Text 7. Indicate whether start and stop vectors are needed in pre-training and inference 8. Make sure the system is designed to work well in both training and inference. You pick the model and lay out the model architecture!
Edit
I'll design a clean architecture that meets your core requirements while addressing the issues from the PRD review.
Proposed Architecture: Continuous Concept Transformer (CCT)
Model Choice Rationale
Instead of forcing Mamba into continuous space, I recommend a modified Transformer architecture specifically designed for continuous vectors. This gives us mature, well-understood attention blocks, solid MPS support on Apple Silicon, and a simpler path to stable training directly in the 768-D embedding space.
Architecture Overview
python
class ContinuousConceptTransformer(nn.Module):
    """Transformer over sequences of 768-D concept vectors (GTR-T5 space).

    Operates entirely in the 768-D embedding space: inputs are unit-normalized
    concept vectors and outputs are unit-normalized predicted concept vectors,
    so both ends stay compatible with GTR-T5 encoding and vec2text decoding.
    Trainable start/stop/mask vectors mark sequence boundaries.
    """

    def __init__(
        self,
        d_model=768,  # NOTE(review): currently unused; the layers hard-code 768
        n_heads=12,
        n_layers=12,
        ff_dim=3072,
        max_concepts=512,  # ~8.7k tokens at an average of 17 tokens/concept
        dropout=0.1,
    ):
        super().__init__()
        # Special concept vectors (trainable); they live in the same 768-D
        # space as the GTR-T5 embeddings.
        self.start_vec = nn.Parameter(torch.randn(768))
        self.stop_vec = nn.Parameter(torch.randn(768))
        self.mask_vec = nn.Parameter(torch.randn(768))  # reserved for masked training
        # No dimension reduction -- stay at 768-D throughout.
        self.input_norm = nn.LayerNorm(768)
        # Learned positional embeddings indexed per concept (not per token).
        self.concept_pos_embed = nn.Parameter(torch.randn(max_concepts, 768))
        # Core transformer. NOTE(review): nn.TransformerEncoder attends
        # bidirectionally; autoregressive generation would need a causal
        # src mask -- confirm intended usage.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=768,
            nhead=n_heads,
            dim_feedforward=ff_dim,
            dropout=dropout,
            activation='gelu',
            batch_first=True,
            norm_first=True,  # Pre-LN for training stability
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, n_layers)
        # Output projection; a residual connection back to the raw input
        # vectors is added in forward().
        self.output_proj = nn.Sequential(
            nn.LayerNorm(768),
            nn.Linear(768, 768),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(768, 768),
        )
        # Small epsilon used when enforcing unit norm on the output.
        self.register_buffer('eps', torch.tensor(1e-8))

    def forward(self, concept_vecs, mask=None):
        """Map concept vectors to next-concept predictions.

        Args:
            concept_vecs: [B, L, 768] unit-normalized concept vectors.
            mask: optional [B, L] key-padding mask (True = padded position).

        Returns:
            [B, L, 768] unit-normalized output vectors.
        """
        seq_len = concept_vecs.size(1)
        # Add learned per-concept positional embeddings.
        x = concept_vecs + self.concept_pos_embed[:seq_len]
        x = self.input_norm(x)
        x = self.transformer(x, src_key_padding_mask=mask)
        # Project, with a residual back to the raw input vectors.
        out = self.output_proj(x) + concept_vecs
        # Re-project onto the unit sphere (GTR-T5/vec2text expect unit norm).
        out = out / (out.norm(dim=-1, keepdim=True) + self.eps)
        return out
Training Architecture
python
class ConceptSequenceTrainer:
    """Prepares text as concept-vector sequences for training the model.

    Relies on a frozen GTR-T5 encoder (text -> 768-D) and a frozen vec2text
    decoder (768-D -> text); neither is updated during training.
    """

    def __init__(self, model, concept_store):
        self.model = model
        self.concept_store = concept_store
        self.gtr_t5 = GTR_T5()      # frozen text -> 768-D encoder
        self.vec2text = Vec2Text()  # frozen 768-D -> text decoder

    def prepare_sequence(self, text):
        """Convert text to a stacked [L+2, 768] tensor of concept vectors.

        The sequence is bracketed by the model's trainable start/stop vectors.
        """
        # Segment on semantic boundaries, then encode each chunk.
        segments = self.segment_text(text)
        concept_vecs = [self.gtr_t5.encode(seg) for seg in segments]
        # Bracket with the special start/stop vectors.
        sequence = [self.model.start_vec] + concept_vecs + [self.model.stop_vec]
        return torch.stack(sequence)

    def segment_text(self, text, min_words=5, max_words=50, target_avg=17):
        """Greedily merge sentences into segments of at most max_words words.

        Uses NLTK sentence boundaries, then packs consecutive sentences into
        a segment until adding the next one would exceed max_words.
        NOTE(review): min_words and target_avg are currently unused; only the
        max_words ceiling is enforced.
        """
        sentences = sent_tokenize(text)
        segments = []
        current_segment = []
        current_words = 0
        for sent in sentences:
            words = sent.split()
            # Close the current segment when this sentence would overflow it
            # (never emit an empty segment, so an oversized first sentence
            # still starts a segment).
            if current_words + len(words) > max_words and current_segment:
                segments.append(' '.join(current_segment))
                current_segment = [sent]
                current_words = len(words)
            else:
                current_segment.append(sent)
                current_words += len(words)
        if current_segment:
            segments.append(' '.join(current_segment))
        return segments
Loss Functions
python
class CombinedLoss(nn.Module):
    """Next-concept prediction loss over 768-D concept vectors.

    Combines a cosine-distance term with an L2 term, plus (in training mode)
    an InfoNCE-style in-batch contrastive term to discourage representation
    collapse.
    """

    def __init__(self):
        super().__init__()
        self.cosine_weight = 0.7
        self.l2_weight = 0.3

    def forward(self, pred_vecs, target_vecs, mask=None):
        """Compute the combined loss.

        Args:
            pred_vecs: [B, L, 768] model outputs.
            target_vecs: [B, L, 768] ground-truth concept vectors.
            mask: unused; reserved for padding-aware reductions.
        """
        # Next-concept prediction: position t predicts the vector at t+1.
        shift_pred = pred_vecs[:, :-1]     # [B, L-1, 768]
        shift_target = target_vecs[:, 1:]  # [B, L-1, 768]
        # Cosine loss: 1 - cos_sim, averaged over batch and positions.
        cos_sim = F.cosine_similarity(shift_pred, shift_target, dim=-1)
        cos_loss = (1 - cos_sim).mean()
        # L2 term for stability.
        l2_loss = F.mse_loss(shift_pred, shift_target)
        # FIX: the weights must multiply their terms (the '*' operators were
        # missing in the source, a syntax error).
        loss = self.cosine_weight * cos_loss + self.l2_weight * l2_loss
        # Optional contrastive term with in-batch negatives (training only).
        if self.training:
            pred_flat = shift_pred.reshape(-1, 768)    # [B*(L-1), 768]
            target_flat = shift_target.reshape(-1, 768)
            # Similarity of every prediction to every target, temperature 0.07.
            sim_matrix = torch.mm(pred_flat, target_flat.T) / 0.07
            # Each prediction's positive is the target at the same flat index.
            labels = torch.arange(len(pred_flat), device=pred_flat.device)
            contrastive_loss = F.cross_entropy(sim_matrix, labels)
            # FIX: '*' was also missing here.
            loss = loss + 0.2 * contrastive_loss
        return loss
Inference Pipeline
python
class ConceptInference:
    """Autoregressive concept-vector generation with cached vector->text decoding."""

    def __init__(self, model, concept_store, vec2text):
        self.model = model
        self.concept_store = concept_store
        self.vec2text = vec2text
        # Minimum cosine similarity for a concept-store hit to be reused
        # instead of running the (slower) vec2text decoder.
        self.cache_threshold = 0.85

    @torch.no_grad()
    def generate(self, prompt, max_concepts=50):
        """Generate a text continuation of `prompt`.

        NOTE(review): prepare_sequence is not defined on this class; it is
        assumed to be provided (e.g. shared with the trainer) and to return
        a [L, 768] tensor bracketed by start/stop vectors -- confirm.
        """
        # FIX: the prompt's concept vectors were computed but never fed to
        # the model, so generation ignored the prompt entirely. Seed the
        # sequence with them, dropping the trailing stop vector.
        prompt_concepts = self.prepare_sequence(prompt)
        generated = list(prompt_concepts[:-1])
        n_prompt = len(generated)
        for _ in range(max_concepts):
            input_seq = torch.stack(generated).unsqueeze(0)  # [1, L, 768]
            output = self.model(input_seq)
            next_vec = output[0, -1]  # [768]
            # Stop when the model emits something close to the stop vector.
            if F.cosine_similarity(next_vec, self.model.stop_vec, dim=0) > 0.9:
                break
            generated.append(next_vec)
        # Decode only the newly generated concepts (skip the prompt prefix).
        text_segments = [self.decode_vector(vec) for vec in generated[n_prompt:]]
        return ' '.join(text_segments)

    def decode_vector(self, vec):
        """Decode one 768-D vector to text, preferring the concept store."""
        # Nearest-neighbor lookup first; fall back to vec2text on a miss.
        neighbors = self.concept_store.query(vec, k=1)
        if neighbors and neighbors[0].cosine > self.cache_threshold:
            return neighbors[0].text
        # vec2text fallback; cache the result so future lookups hit the store.
        text = self.vec2text.decode(vec)
        self.concept_store.upsert(
            vector=vec,
            text=text,
            source='vec2text'
        )
        return text
Training Configuration
python
# Prototype configuration sized for a MacBook Pro M4 with the MPS backend.
config = {
    # --- model ---
    "d_model": 768,
    "n_heads": 12,
    "n_layers": 6,        # smaller than the default 12 to fit the M4 Mac
    "ff_dim": 2048,       # reduced feed-forward width for memory headroom
    "max_concepts": 256,  # ~4.3k tokens at ~17 tokens per concept
    # --- training ---
    "batch_size": 8,             # sized for 128 GB unified memory
    "gradient_accumulation": 4,  # effective batch size of 32
    "learning_rate": 1e-4,
    "warmup_steps": 1000,
    "max_steps": 50000,
    # --- data segmentation ---
    "min_segment_words": 5,
    "max_segment_words": 50,
    "target_avg_words": 17,
    # --- inference ---
    "cache_threshold": 0.85,
    "max_generation_concepts": 100,
}
Key Design Decisions
Start and Stop vectors: yes, both are needed in pre-training and inference.
- Training: the start/stop vectors define sequence boundaries for the autoregressive loss
- Inference: the stop vector tells the generator when to stop
- Both receive special handling in the attention masks
Evaluation Pipelines
python
def evaluate_pipelines(test_samples):
    """Run the two test pipelines over `test_samples` and collect metrics.

    Pipeline 1 (baseline): text -> GTR-T5 -> 768-D -> vec2text -> text.
    Pipeline 2 (model):    text -> concepts -> model -> 768-D -> vec2text -> text.

    NOTE(review): gtr_t5, vec2text, model, prepare_sequence, decode_sequence,
    compute_metrics and compute_text_metrics are assumed to be in scope at
    module level -- confirm against the rest of the file.

    Returns:
        A list of per-sample dicts with 'baseline', 'model', and
        'text_quality' scores.
    """
    results = []
    for text in test_samples:
        # Pipeline 1: round-trip baseline (no model in the loop).
        vec = gtr_t5.encode(text)
        reconstructed = vec2text.decode(vec)
        baseline_score = compute_metrics(text, reconstructed)
        # Pipeline 2: generate through the concept model.
        concepts = prepare_sequence(text)
        output = model.generate_from_concepts(concepts[:-1])  # exclude stop vec
        generated_text = decode_sequence(output)
        # NOTE(review): this compares the last target vector against the last
        # generated vector -- a vector-space similarity, not a text metric.
        model_score = compute_metrics(concepts[-1], output[-1])
        results.append({
            'baseline': baseline_score,
            'model': model_score,
            'text_quality': compute_text_metrics(text, generated_text),
        })
    # FIX: results were accumulated but never returned.
    return results
This architecture is designed to train and run inference efficiently on Apple Silicon via MPS, staying in the 768-D GTR-T5 vector space end to end so both pipelines remain compatible with vec2text decoding.