genular
Gene Cell Repository: A Comprehensive Gene/Cell Database.
This is a demo database interface. Results are limited to a small (100) subset of data. For full access, please use the API or download the data dumps.
What's This All About?
Our goal is to understand how genes and cells interact in a deeper, data-driven way. To do this, we created a database packed with information about genes, proteins, and cell activity.
- Interconnected Data Insights: Each gene record links together gene expressions, disease connections, and more, all in one place.
- Detailed Gene Analysis: Explore and filter genes under specific conditions, like T-cell activity, to understand their role in different biological processes.
- Exploring Gene Pathways: Discover the pathways and networks genes are part of and how they relate to diseases or conditions.
- Detailed Expression Profiles: Access comprehensive gene expression profiles across different cell types and organisms, offering insights into gene regulation and function.
The Genular Podcast: Genes, Cells, and Discoveries
Genes
52M
Proteins
9.4M
Unique Cells
74.5M
(stats on 13 March 2024)
Document Schema
/**
 * Field layout of a single gene document.
 *
 * Each leaf is a descriptor object (`type`, optionally `index` / `default`);
 * a descriptor wrapped in `[ ... ]` declares an array of that shape.
 * Notes of the form `file.column` (e.g. `gene2accession.GeneID`) are carried
 * over from the original annotations and appear to name the upstream source
 * column for the field.
 */
const gene = {
  // Unique NCBI Gene ID (gene2accession.GeneID).
  geneID: { type: Number, index: { unique: true } },

  // Taxonomy information.
  tax: {
    // Taxonomy ID (gene2accession.tax_id).
    id: { type: Number, index: true },
    name: {
      // Taxonomy name (taxdump.names.dmp.name_txt).
      name: { type: String },
      unique: { type: String },
      // A field literally called "type", hence the nested descriptor.
      type: { type: Number },
    },
  },

  // Last update timestamp; defaults to the current time.
  updated: { type: Date, default: Date.now },

  // Gene status, e.g. Predicted, Validated (gene2accession.status).
  geneStatus: { type: String },

  // Accession information.
  accession: {
    // NCBI RNA nucleotide accession number
    // (gene2accession.RNA_nucleotide_accession.version).
    rna: { type: String },
    // Protein accession numbers (gene2accession.protein_accession.version).
    protein: [{ type: String }],
    // Genomic nucleotide accession numbers
    // (gene2accession.genomic_nucleotide_accession.version).
    gene: [{ type: String }],
    // NCBI peptide reference sequence
    // (gene2accession.mature_peptide_accession.version).
    peptide: { type: String },
  },

  // Gene expression profiles across cell types: one entry of per-cell
  // statistics for each cell/context pairing.
  cellSignificanceIndex: [
    {
      // Cell identifier (e.g. "CL0000236"); generally follows a standardized
      // nomenclature or cell ontology ID.
      i: { type: String },
      // Context or condition the cell is in (e.g. "overall", or some
      // ontology-based condition).
      c: { type: String },
      // Adjusted p-value (pAdjVal) for this cell/gene expression comparison.
      p: { type: Number },
      // Delta value (deltaVal): the expression difference or effect size.
      d: { type: Number },
      // Robust pAdjVal threshold specific to this cell/context pairing.
      tp: { type: Number },
      // Robust deltaVal threshold for this cell/context pairing.
      td: { type: Number },
      // Ratio of the p-value to its threshold (pAdjVal / thresholdP).
      fcp: { type: Number },
      // Ratio of the delta value to its threshold (deltaVal / thresholdD).
      fcd: { type: Number },
      // "Cell Significance Index Score" computed from pAdjVal and deltaVal
      // (e.g. deltaVal * -log10(pAdjVal)) as an overall significance metric.
      cs: { type: Number },
      // Summary statistics of the expression data for this cell.
      e: {
        mean: { type: Number }, // mean of raw expression values
        median: { type: Number },
        min: { type: Number },
        max: { type: Number },
        var: { type: Number }, // variance
        std: { type: Number }, // standard deviation
        // Original expression values.
        data: { type: [Number] },
      },
    },
  ],

  // mRNA expression levels (immune cell specificity).
  mRNAExpressions: {
    proteinAtlas: [
      {
        l: { type: String }, // lineage
        c: { type: String }, // cell
        e: { type: Number }, // normalizedRNAExpression
      },
    ],
  },

  // Cross-references to other databases.
  crossReference: {
    // Identifiers in other databases as database:value pairs
    // (gene_info.dbXrefs); exception noted in the original: HGNC:HGNC:1100.
    bulk: [
      {
        dbName: { type: String },
        value: { type: String },
      },
    ],
    // ENSEMBL gene identifier (gene2ensembl.Ensembl_gene_identifier).
    enseGeneID: { type: String },
    // ENSEMBL protein identifiers (gene2ensembl.Ensembl_protein_identifier).
    enseProtID: [{ type: String }],
    // ENSEMBL RNA identifiers (gene2ensembl.Ensembl_rna_identifier).
    enseRnaID: [{ type: String }],
    // PubMed references (gene2pubmed.PubMed).
    pubMed: [{ type: Number }],
  },

  // Genomic position information.
  genePos: {
    // Start position on the genomic sequence
    // (gene2accession.start_position_on_the_genomic_accession).
    start: { type: Number },
    // End position on the genomic sequence
    // (gene2accession.end_position_on_the_genomic_accession).
    end: { type: Number },
  },

  // Gene orientation: '+', '-', or '?' (gene2accession.orientation).
  orientation: { type: String },
  // Gene symbol, indexed for search (gene2accession.Symbol).
  symbol: { type: String, index: true },
  // Locus tag (gene_info.LocusTag).
  locTag: { type: String },

  // Chromosome on which this gene is placed (gene_info.chromosome).
  chrom: {
    pos: { type: Number },
    // Chromosome type (MT or NULL); a field literally called "type".
    type: { type: String },
    // Chromosome location.
    loc: { type: String },
  },

  // Gene description (gene_info.description).
  desc: { type: String },

  // Gene type code (gene_info.type of gene):
  //   unknown (0), tRNA (1), rRNA (2), snRNA (3), scRNA (4), snoRNA (5),
  //   protein-coding (6), pseudo (7), transposon (8), miscRNA (9),
  //   ncRNA (10), other (255).
  geneType: { type: Number },

  // MIM (Mendelian Inheritance in Man) data.
  mim: [
    {
      // MIM number (OMIM) (mim2gene_medgen.MIM number).
      id: { type: String, index: true },
      // MIM relation: gene | phenotype (mim2gene_medgen.type).
      relation: { type: String },
      // MedGenCUI (mim2gene_medgen.MedGenCUI).
      cui: { type: Number },
    },
  ],

  // Gene ontology data from the GO and Reactome databases.
  ontology: [
    {
      // Ontology ID, e.g. GO:0005634 or R-HSA-9033241
      // (gene2go.GO ID or Reactome ID).
      id: { type: String },
      // Ontology term, e.g. biological_process or
      // "N-terminal protein myristoylation"
      // (gene2go.GO term or Reactome pathway name).
      term: { type: String },
      // Ontology category: Function, Process, or Component (gene2go.Category).
      cat: { type: String },
      // PubMed uids reported as evidence for the association; pipe-delimited
      // in the source (gene2go.PubMed).
      pubMed: [{ type: Number }],
    },
  ],

  // Gene relationships.
  geneRelations: [
    {
      // Type of gene relation (gene_group.relationship). Values listed in the
      // original: Ortholog, Potential readthrough sibling, Readthrough child,
      // Readthrough parent, Readthrough sibling, Region member, Region parent,
      // Related functional gene, Related pseudogene.
      relationType: { type: String },
      // Similar genes (gene_group.Other GeneID).
      similarGenes: [{ type: String }],
    },
  ],

  // Gene disorders (requires an OMIM license).
  geneDisorder: [
    {
      // Disorder name (morbidmap.Disorder).
      name: { type: String },
      // Cytogenetic location (morbidmap.cytogenetic location).
      loc: { type: String },
    },
  ],

  // Protein data.
  protein: [
    {
      // GENULAR proteinID
      //   = geneID + protein mass + protein length + crc32(sequence).
      // (A `dropDups: true` option was left commented out in the original.)
      proteinID: { type: Number, index: { unique: true } },
      // UniProt accessions; a protein can have several
      // (uniprot_sprot.accession).
      accession: [{ type: String, index: true }],
      // Protein name (uniprot_sprot.name).
      symbol: { type: String },
      // Full descriptive protein name
      // (uniprot_sprot.protein.recommendedName.fullName).
      name: { type: String },

      // Protein IDs in other databases.
      databaseIDs: {
        // Protein structure ID (idmapping_selected.PDB).
        pdbID: [{ type: String }],
        // Protein GO ID (idmapping_selected.GO).
        goID: [{ type: String }],
        // UniGene protein cluster (idmapping_selected.UniGene).
        unigeneID: { type: String },
        // Protein InterPro ID (uniprot_???.InterPro).
        interProID: [{ type: String }],
        // Protein family IDs.
        Pfam: [{ type: String }],
        // Protein domain IDs.
        PROSITE: [{ type: String }],
        // UniGene ID for protein cluster.
        UniGene: { type: String },
        // Protein database summary IDs.
        PDBsum: [{ type: String }],
        // Protein model IDs.
        ProteinModelPortal: { type: String },
        // Database of Interacting Proteins ID.
        DIP: { type: String },
        // Molecular INTeraction database ID.
        MINT: { type: String },
        // Protein-protein interaction IDs.
        STRING: { type: String },
        // Protein binding data IDs.
        BindingDB: { type: String },
        // Chemical entities of biological interest IDs.
        ChEMBL: { type: String },
        // Dephosphorylation database ID.
        DEPOD: { type: String },
        // Integrated Post-Translational Modification Network ID.
        iPTMnet: { type: String },
        // Protein phosphorylation site IDs.
        PhosphoSite: { type: String },
        // Protein palmitoylation data IDs.
        SwissPalm: { type: String },
        // Unified carbohydrate knowledgebase ID.
        UniCarbKB: { type: String },
        // Protein mutation data IDs.
        BioMuta: { type: String },
        // Domain Mapping of Disease Mutations ID.
        DMDM: { type: String },
        // Eukaryotic Promoter Database ID.
        EPD: { type: String },
        // MaxQuant quantitative proteomics data ID.
        MaxQB: { type: String },
        // Protein abundance database ID.
        PaxDb: { type: String },
        // Proteomics Identifications Database ID.
        PRIDE: { type: String },
        // NCBI Gene ID.
        GeneID: { type: String },
        // Kyoto Encyclopedia of Genes and Genomes ID.
        KEGG: { type: String },
        // Comparative Toxicogenomics Database ID.
        CTD: { type: String },
        // GeneCards ID for human genes.
        GeneCards: { type: String },
        // Human Protein Atlas ID.
        HPA: [{ type: String }],
        // MalaCards ID for human diseases.
        MalaCards: { type: String },
        // neXtProt ID for human proteins.
        neXtProt: { type: String },
        // Pharmacogenomics Knowledgebase ID.
        PharmGKB: { type: String },
        // Homologous genes database ID.
        HOGENOM: { type: String },
        // Homologous vertebrate genes database ID.
        HOVERGEN: { type: String },
        // Eukaryotic ortholog groups ID.
        InParanoid: { type: String },
        // KEGG Orthology ID.
        KO: { type: String },
        // Phylome database ID.
        PhylomeDB: { type: String },
        // TreeFam database ID.
        TreeFam: { type: String },
        // Signaling pathway database ID.
        SignaLink: { type: String },
        // SIGNOR signaling network ID.
        SIGNOR: { type: String },
        // Evolutionary Trace Report Maker ID.
        EvolutionaryTrace: { type: String },
        // Gene Wiki ID for gene information.
        GeneWiki: { type: String },
        // GenomeRNAi database ID for RNAi data.
        GenomeRNAi: { type: String },
        // Protein Ontology ID from PRO.
        PRO: { type: String },
        // UniProt Proteomes ID.
        Proteomes: { type: String },
        // Database of gene expression evolution ID.
        Bgee: { type: String },
        // Expression reference database ID.
        CleanEx: { type: String },
      },

      // Citations, keyed by DOI in the source
      // (e.g. <dbReference id="10.1104/pp.101.4.1413" type="DOI" />).
      citations: [
        {
          // Title of the citation.
          title: { type: String },
          // PubMed ID associated with the citation.
          pubmedID: { type: String },
          // Digital Object Identifier (DOI) for the citation.
          doi: { type: String },
          // Scope or context of the citation.
          scope: [{ type: String }],
        },
      ],

      // RefSeq protein similarity indices (idmapping_selected.RefSeq).
      refSeq: {
        // UniRef cluster IDs at increasing sequence-similarity levels.
        uniref: {
          s50: { type: String }, // UniRef50 ID: cluster at 50% similarity
          s90: { type: String }, // UniRef90 ID: cluster at 90% similarity
          s100: { type: String }, // UniRef100 ID: cluster at 100% similarity
        },
      },

      // Protein family information (Pfam ID, source: UniProt).
      proteinFamily: {
        // PFAM ID for protein family classification
        // (pdb_pfam_mapping.PFAM_ACC).
        accession: { type: String },
        // Name of the protein family (pdb_pfam_mapping.PFAM_Name).
        name: { type: String },
        // Description of the protein family (pdb_pfam_mapping.PFAM_desc).
        description: { type: String },
        // Value representing the protein family (pdb_pfam_mapping.eValue).
        value: { type: String },
      },

      // Protein motif details from the PROSITE database.
      proteinMotifs: [
        {
          // PROSITE internal ID for the motif (prosite.ID).
          id: { type: String },
          // Description of the motif pattern (prosite.DE).
          description: { type: String },
          // Motif sequence pattern (prosite.PA).
          sequence: { type: String },
        },
      ],

      // Protein interaction partners.
      interactionPartners: [
        {
          // Partner's UniProt ID, formatted string_id.uniprot_id
          // (protein.links.protein2).
          partnerID: { type: String },
          // Combined interaction score (protein.links.combined_score).
          score: { type: Number },
        },
      ],

      // Protein sequence information.
      sequence: {
        // Length of the protein sequence.
        length: { type: Number },
        // Molecular mass of the protein.
        mass: { type: Number },
        // Checksum for sequence verification.
        checksum: { type: String },
        // Date of last modification.
        modified: { type: Date },
        // Version of the protein sequence.
        version: { type: Number },
        // Amino acid sequence (uniprot_???.sprot/trembl).
        sequence: { type: String },
      },

      // Protein existence type (e.g. predicted).
      existence: { type: Number },
      // Source of the protein data: sprot (1), trembl (2)
      // (uniprot_???.sprot/trembl).
      relevance: { type: Number },
      // UniParc ID in the FASTA sequence database, e.g. UPI00003B0FD4
      // (idmapping_selected.UniParc).
      uniParcID: { type: String },
    },
  ],
};
API documentation & data access
Use API or download data directly:
-
Database dump:
Download genes_and_helpers.tar.gz dump
Once extracted, it can be imported into a local MongoDB instance using the following command: mongorestore --host 127.0.0.1 --port 27017 --username root --password xxx --authenticationDatabase admin /path/mongobackup
- API Interaction: Visit API
- R Package: Use the genular package for integration with R. Request an API key here.