# Source code for pyhpo.set

```
from pyhpo.ontology import Ontology
from pyhpo.term import HPOTerm
import warnings
class HPOSet(set):
    """
    A set of HPO terms with helper methods for set-level analyses.

    In addition to the regular ``set`` storage, every instance keeps
    ``self._list``, a list holding the same items in insertion order.
    The pair generators (:func:`combinations` and
    :func:`combinations_one_way`) iterate over that list, so the mutating
    methods defined on this class keep both containers synchronized.

    Parameters
    ----------
    items: iterable
        The initial items of the set. Duplicates are dropped and one-shot
        iterables (e.g. generators) are supported.
    """

    def __init__(self, items):
        set.__init__(self)
        self._list = []
        # Fill both containers in a single pass: ``items`` may be a
        # generator (which can only be consumed once) and may contain
        # duplicates (which must not end up in ``self._list``).
        for item in items:
            if item not in self:
                set.add(self, item)
                self._list.append(item)

    def add(self, item):
        """
        Overwrites ``set.add`` to ensure we keep the
        ``self._list`` property updated as well.
        """
        if item not in self:
            set.add(self, item)
            self._list.append(item)

    def update(self, items):
        """
        Overwrites ``set.update`` to ensure we keep the
        ``self._list`` property updated as well.

        Parameters
        ----------
        items: iterable
            The items to add to the set
        """
        for item in items:
            self.add(item)

    def remove(self, item):
        """
        Overwrites ``set.remove`` to ensure we keep the
        ``self._list`` property updated as well.

        Raises
        ------
        KeyError
            If ``item`` is not part of the set
        """
        set.remove(self, item)
        self._list.remove(item)

    def discard(self, item):
        """
        Overwrites ``set.discard`` to ensure we keep the
        ``self._list`` property updated as well.
        Does nothing if ``item`` is not part of the set.
        """
        if item in self:
            set.remove(self, item)
            self._list.remove(item)

    def pop(self):
        """
        Overwrites ``set.pop`` to ensure we keep the
        ``self._list`` property updated as well.

        Returns
        -------
        object
            The item that was removed from the set

        Raises
        ------
        KeyError
            If the set is empty
        """
        item = set.pop(self)
        self._list.remove(item)
        return item

    def clear(self):
        """
        Overwrites ``set.clear`` to ensure we keep the
        ``self._list`` property updated as well.
        """
        set.clear(self)
        self._list = []

    def child_nodes(self):
        """
        Return a new HPOSet that contains only
        the most specific HPO term for each subtree

        It basically will return only HPO terms
        that do not have descendant HPO terms
        present in the set

        Returns
        -------
        HPOSet
            HPOSet instance that contains only the most specific
            child nodes of the current HPOSet
        """
        # For every term, count how many other terms of the set
        # are its descendants.
        counter = {term.id: 0 for term in self}
        for child, parent in self.combinations():
            if child.child_of(parent):
                counter[parent.id] += 1
        return HPOSet([
            term for term in self if counter[term.id] == 0
        ])

    def remove_modifier(self):
        """
        Removes all modifier terms. By default, this includes

        * ``Mode of inheritance: 'HP:0000005'``
        * ``Clinical modifier: 'HP:0012823'``
        * ``Frequency: 'HP:0040279'``
        * ``Clinical course: 'HP:0031797'``
        * ``Blood group: 'HP:0032223'``
        * ``Past medical history: 'HP:0032443'``

        Returns
        -------
        HPOSet
            HPOSet instance that contains only the terms
            whose ``is_modifier`` attribute is falsy
        """
        return HPOSet([
            term for term in self if not term.is_modifier
        ])

    def replace_obsolete(self, verbose=False):
        """
        Replaces obsolete terms with the replacement term

        .. warning::

           Not all obsolete terms have a replacement. Such terms are
           dropped from the result and a ``UserWarning`` is issued.

        Parameters
        ----------
        verbose: bool, default: ``False``
            Kept for backwards compatibility. It is currently not
            evaluated: the warning for obsolete terms without a
            replacement is always issued.

        Returns
        -------
        HPOSet
            A new HPOSet
        """
        ids = set()
        for term in self:
            if term.is_obsolete:
                try:
                    replaced = Ontology[
                        HPOTerm.id_from_string(term.replaced_by)
                    ]
                    ids.add(replaced)
                except AttributeError:
                    # No usable ``replaced_by`` information on the term
                    warnings.warn(
                        'The term {} is obsolete and has no replacement.'
                        .format(term),
                        UserWarning)
            else:
                ids.add(term)
        return HPOSet(ids)

    def all_genes(self):
        """
        Calculates the union of the genes
        attached to the HPO Terms in this set

        Returns
        -------
        set of :class:`annotations.Gene`
            Set of all genes associated with the HPOTerms in the set
        """
        genes = set()
        for term in self:
            genes.update(term.genes)
        return genes

    def omim_diseases(self):
        """
        Calculates the union of the Omim diseases
        attached to the HPO Terms in this set

        Returns
        -------
        set of :class:`annotations.Omim`
            Set of all Omim diseases associated with the HPOTerms in the set
        """
        omims = set()
        for term in self:
            omims.update(term.omim_diseases)
        return omims

    def orpha_diseases(self):
        """
        Calculates the union of the Orpha diseases
        attached to the HPO Terms in this set

        Returns
        -------
        set
            Set of all Orpha diseases associated with the HPOTerms
            in the set
        """
        orphas = set()
        for term in self:
            orphas.update(term.orpha_diseases)
        return orphas

    def decipher_diseases(self):
        """
        Calculates the union of the Decipher diseases
        attached to the HPO Terms in this set

        Returns
        -------
        set
            Set of all Decipher diseases associated with the HPOTerms
            in the set
        """
        deciphers = set()
        for term in self:
            deciphers.update(term.decipher_diseases)
        return deciphers

    def information_content(self, kind=None):
        """
        Gives back basic information content stats about the
        HPOTerms within the set

        Parameters
        ----------
        kind: str, default: ``omim``
            Which kind of information content should be calculated.
            Options are ['omim', 'orpha', 'decipher', 'gene']

        Returns
        -------
        dict
            Dict with the following items

            * **mean** - float - Mean information content
            * **max** - float - Maximum information content value
            * **total** - float - Sum of all information content values
            * **all** - list of float -
              List with all information content values

            For an empty set, ``mean``, ``max`` and ``total`` are ``0``
            and ``all`` is an empty list.
        """
        if kind is None:
            kind = 'omim'
        values = [term.information_content[kind] for term in self]
        if not values:
            # ``max([])`` and a division by ``len == 0`` would both raise
            return {'mean': 0, 'total': 0, 'max': 0, 'all': []}
        total = sum(values)
        return {
            'mean': total / len(values),
            'total': total,
            'max': max(values),
            'all': values
        }

    def variance(self):
        """
        Calculates the distances between all its term-pairs. It also provides
        basic calculations for variances among the pairs.

        Returns
        -------
        tuple of (number, number, number, list of number)
            Tuple with the variance metrics

            * Average distance between pairs
            * Smallest distance between pairs
            * Largest distance between pairs
            * List of all distances between pairs

            ``(0, 0, 0, [])`` is returned if the set has no term pairs.
        """
        distances = [
            term_a.path_to_other(term_b)[0]
            for term_a, term_b in self.combinations_one_way()
        ]
        if not distances:
            return (0, 0, 0, [])
        return (
            sum(distances) / len(distances),
            min(distances),
            max(distances),
            distances
        )

    def combinations(self):
        """
        Helper generator function that returns all possible two-pair
        combination between all its terms

        This function is direction dependent. That means that every
        pair will appear twice. Once for each direction

        .. seealso:: :func:`pyhpo.set.HPOSet.combinations_one_way`

        Yields
        ------
        Tuple of :class:`term.HPOTerm`
            Tuple containing the follow items

            * **HPOTerm** instance 1 of the pair
            * **HPOTerm** instance 2 of the pair

        Examples
        --------
            ::

                ci = HPOSet([term1, term2, term3])
                ci.combinations()

                # Output:
                [
                    (term1, term2),
                    (term1, term3),
                    (term2, term1),
                    (term2, term3),
                    (term3, term1),
                    (term3, term2)
                ]

        """
        for term_a in self._list:
            for term_b in self._list:
                if term_a == term_b:
                    continue
                yield (term_a, term_b)

    def combinations_one_way(self):
        """
        Helper generator function that returns all possible two-pair
        combination between all its terms

        This method will report each pair only once

        .. seealso:: :func:`pyhpo.set.HPOSet.combinations`

        Yields
        ------
        Tuple of :class:`term.HPOTerm`
            Tuple containing the follow items

            * **HPOTerm** instance 1 of the pair
            * **HPOTerm** instance 2 of the pair

        Example
        -------
            ::

                ci = HPOSet([term1, term2, term3])
                ci.combinations_one_way()

                # Output:
                [
                    (term1, term2),
                    (term1, term3),
                    (term2, term3)
                ]

        """
        for i, term_a in enumerate(self._list):
            for term_b in self._list[i+1:]:
                yield (term_a, term_b)

    def similarity(self, other, kind='omim', method=None):
        """
        Calculates the similarity to another HPOSet
        According to Robinson et al, American Journal of Human Genetics, (2008)
        and Pesquita et al, BMC Bioinformatics, (2008)

        The result is the mean of both one-way similarity scores
        (``self`` to ``other`` and ``other`` to ``self``).

        Parameters
        ----------
        other: HPOSet
            Another HPOSet to measure the similarity to

        kind: str, default ``omim``
            Which kind of information content should be calculated.
            Options are ['omim', 'orpha', 'decipher', 'gene']

        method: string, default ``resnik``
            The method to use to calculate the similarity.
            Unless it is ``equal``, the value is forwarded unchanged
            to the ``similarity_score`` method of the terms.

            Available options:

            * **resnik** - Resnik P, Proceedings of the 14th IJCAI, (1995)
            * **lin** - Lin D, Proceedings of the 15th ICML, (1998)
            * **jc** - Jiang J, Conrath D, ROCLING X, (1997)
              Implementation according to R source code
            * **jc2** - Jiang J, Conrath D, ROCLING X, (1997)
              Implementation according to paper from R ``hposim`` library
              Deng Y, et. al., PLoS One, (2015)
            * **rel** - Relevance measure - Schlicker A, et.al.,
              BMC Bioinformatics, (2006)
            * **ic** - Information coefficient - Li B, et. al., arXiv, (2010)
            * **graphic** - Graph based Information coefficient -
              Deng Y, et. al., PLoS One, (2015)
            * **dist** - Distance between terms
            * **equal** - Calculates exact matches between both sets

        Returns
        -------
        float
            The similarity score to the other HPOSet
        """
        if method == 'equal':
            return self._equality_score(other)

        score1 = HPOSet._sim_score(self, other, kind, method)
        score2 = HPOSet._sim_score(other, self, kind, method)
        return (score1 + score2)/2

    def _equality_score(self, other):
        """
        Returns an equality similarity score.

        Only exact matches between both sets are counted
        and the fraction of exact matches is returned.

        A score of 1 means both sets match 100%,
        0.5 means only half the terms have an exact match.

        This method does not take advantage of the ontology
        and does not take distance measures into account.

        Parameters
        ----------
        other: HPOSet
            Another HPOSet to measure the similarity to

        Returns
        -------
        float
            The similarity score to the other HPOSet.
            ``0`` if both sets are empty.
        """
        largest = max(len(self), len(other))
        if not largest:
            # Two empty sets: avoid the division by zero
            return 0
        # Hash-based membership test instead of comparing every pair
        matches = sum(1 for term in other if term in self)
        return matches / largest

    @staticmethod
    def _sim_score(set1, set2, kind, method=None):
        """
        Calculates one-way similarity from one HPOSet to another HPOSet

        .. warning::

           This method should not be used by itself.
           Use :func:`pyhpo.set.HPOSet.similarity` instead.

        Parameters
        ----------
        set1: HPOSet
            One HPOSet to measure the similarity from
        set2: HPOSet
            Another HPOSet to measure the similarity to
        kind: str
            Which kind of information content should be calculated.
            Options are ['omim', 'orpha', 'decipher', 'gene']
        method: string, default ``resnik``
            The method to use to calculate the similarity.

        Returns
        -------
        float
            The one-way similarity from one to the other HPOSet.
            ``0`` if one of the sets is empty.
        """
        if not len(set1) or not len(set2):
            return 0

        scores = []
        for set1_term in set1:
            # Best match of ``set1_term`` in ``set2``; never below 0
            best = 0
            for set2_term in set2:
                score = set1_term.similarity_score(set2_term, kind, method)
                if score > best:
                    best = score
            scores.append(best)
        return sum(scores)/len(scores)

    @classmethod
    def from_queries(cls, queries):
        """
        Builds an HPO set by specifying a list of queries to run on the
        :class:`pyhpo.ontology.Ontology`

        Parameters
        ----------
        queries: list of (string or int)
            The queries to be run the identify the HPOTerm from the ontology

        Returns
        -------
        :class:`pyhpo.set.HPOSet`
            A new HPOset

        Examples
        --------
            ::

                ci = HPOSet.from_queries([
                    'Scoliosis',
                    'HP:0001234',
                    12
                ])

        """
        return cls([
            Ontology.get_hpo_object(query) for query in queries
        ])

    @classmethod
    def from_serialized(cls, pickle):
        """
        Re-Builds an HPO set from a serialized HPOSet object

        Parameters
        ----------
        pickle: str
            The serialized HPOSet object, as created by
            :func:`pyhpo.set.HPOSet.serialize`.
            An empty string yields an empty set.

        Returns
        -------
        :class:`pyhpo.set.HPOSet`
            A new HPOset

        Examples
        --------
            ::

                ci = HPOSet.from_serialized('12+24+66628')

        """
        if not pickle:
            # ``serialize`` of an empty set returns '' and
            # ``''.split('+')`` would yield [''], which ``int`` rejects
            return cls([])
        return cls([
            Ontology[int(query)] for query in pickle.split('+')
        ])

    def serialize(self):
        """
        Creates a string serialization that can be used to
        rebuild the same HPOSet via :func:`pyhpo.set.HPOSet.from_serialized`

        Returns
        -------
        str
            A string representation of the HPOSet:
            the integer values of all items, sorted and joined by ``+``
        """
        ids = [str(x) for x in sorted(int(x) for x in self)]
        return '+'.join(ids)

    def toJSON(self, verbose=False):
        """
        Creates a JSON-like object of the HPOSet

        Parameters
        ----------
        verbose: bool, default ``False``
            Include extra properties of the HPOTerm

        Returns
        -------
        list of dict
            a list of HPOTerm dict objects
        """
        return [t.toJSON(verbose) for t in self]

    def __str__(self):
        return '{}: {}'.format(
            self.__class__.__name__,
            ', '.join([x.name for x in self])
        )

    def __repr__(self):
        return '{}.from_serialized("{}")'.format(
            self.__class__.__name__,
            self.serialize()
        )
class BasicHPOSet(HPOSet):
    """
    Child of :class:`.HPOSet` that automatically:

    * removes parent terms
    * removes modifier terms

    NOTE(review): the description of this class also used to list
    "replaces obsolete terms", but no code in this class replaces
    or rejects obsolete terms - confirm the intended behaviour.

    Parameters
    ----------
    items: iterable of :class:`term.HPOTerm`
        The terms to add to the set. Every term is passed
        through :func:`add` and thus filtered.
    """

    def __init__(self, items):
        HPOSet.__init__(self, [])
        for item in items:
            self.add(item)

    def add(self, item):
        """
        Overwrites ``set.add`` to ensure we keep the
        ``self._list`` property updated and
        don't add modifiers or parent terms as well

        The term is **not** added if

        * it is already present in the set
        * it is a modifier term
        * it is a parent of a term that is already present in the set

        When the term is added, all of its parents that are present in
        the set are removed, both from the set and from ``self._list``.

        Parameters
        ----------
        item: :class:`term.HPOTerm`
            The term to add

        Returns
        -------
        BasicHPOSet or None
            The set itself if the term was not added,
            ``None`` if the term was added
        """
        if item in self:
            return self

        if item.is_modifier:
            return self

        if any(item.parent_of(term) for term in self):
            return self

        for parent in item.all_parents:
            if parent in self:
                # Remove the less specific term from both containers.
                # Removing it only from the set storage would leave a
                # stale entry in ``self._list``, which is the container
                # the pair generators iterate over.
                set.remove(self, parent)
                if parent in self._list:
                    self._list.remove(parent)

        set.add(self, item)
        self._list.append(item)
```