Egor Bulychev, source{d}.
Egor Bulychev
source{d}
def nearest_neighbors(self, origin, k=10,
skipped_stop=0.99, throw=True):
# origin can be either a text query or an id
if isinstance(origin, (tuple, list)):
words, weights = origin
weights = numpy.array(weights, dtype=numpy.float32)
index = None
avg = self._get_centroid(words, weights, force=True)
else:
index = origin
words, weights = self._get_vocabulary(index)
avg = self._get_centroid_by_index(index)
if avg is None:
raise ValueError(
"Too little vocabulary for %s: %d" % (index, len(words)))
def nearest_neighbors(self, origin, k=10,
skipped_stop=0.99, throw=True):
# origin can be either a text query or an id
if isinstance(origin, (tuple, list)):
words, weights = origin
weights = numpy.array(weights, dtype=numpy.float32)
index = None
avg = self._get_centroid(words, weights, force=True)
else:
index = origin
words, weights = self._get_vocabulary(index)
avg = self._get_centroid_by_index(index)
if avg is None:
raise ValueError(
"Too little vocabulary for %s: %d" % (index, len(words)))
⬤ keywords⬤ identifiers⬤ literals⬤ strings⬤ comments⬤ reserved
_tcp_socket_connect -> [tcp, socket, connect]
AuthenticationError -> [authentication, error]
set_visible -> [set, visible]
class ???:def connect(self, dbname, user, password, host, port):# ...def query(self, sql):# ...def close(self):# ...
class Database:def connect(self, dbname, user, password, host, port):# ...def query(self, sql):# ...def close(self):# ...
set_name -> [set, name]
SetName -> [set, name]
BUFFERFLAG_CODECCONFIG -> [bufferflag, codecconfig]
metamodelength -> [metamodelength]
BUFFERFLAG_CODECCONFIG -> [buffer, flag, codec, config]
metamodelength -> [meta, mode, length]
“send” - “receive” + “pop” = “push”
“database” - “query” + “tune” = “settings”
\( \begin{split} V_1 \Leftrightarrow & \,\texttt{"foo"} \\ \\ V_2 \Leftrightarrow & \,\texttt{"bar"} \\ \\ V_3 \Leftrightarrow & \,\texttt{"integrate"} \end{split} \)
\( distance(V_1, V_2) < distance(V_1, V_3) \)
\( distance(V_i, V_j) = \arccos \frac{V_i \cdot V_j}{\left\lVert V_i \right\rVert \left\lVert V_j \right\rVert} \)
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
_tcp_socket_connect -> [tcp, socket, connect]
AuthenticationError -> [authentication, error]
authentication, authenticate -> authenticate
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
database, connect2, user2, password2, host2, port2, tcp, socket2, authenticate2, error, close
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
connect2, user2, password2, host2, port2, tcp, socket2, authenticate2, error, close
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
connect, user, password, host, port
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
tcp, socket, connect, host, port
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
authenticate2, user, password, error, socket, close
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
authenticate, user, password
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
authenticate, error, socket, close
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
authenticate, error
class Database:def connect(self, user, password, host, port):self._tcp_socket_connect(host, port)try:self._authenticate(user, password)except AuthenticationError as e:self.socket.close()raise e from None
socket, close
| Stage | Time | Resources | Size on disk |
|---|---|---|---|
| Cloning 143k repos | 3 days | 20x2 cores, 256 GB RAM | 2.6 TB |
| Dataset | 4 days | 20x2 cores, 256 GB RAM | 2 TB (31 GB in xz) |
| fastprep | 2 days | 16x2 cores, 256 GB RAM | 20 GB |
| Swivel | 14 hours | 2 Titan X'2016 + 2 1080Ti | 5.6 GB |
Let's interpret every repository as a weighted bag-of-words.
We calculate TF-IDF to weight the occurring identifiers.
tensorflow/tensorflow:
tfreturn 67.780249
oprequires 63.968142
doblas 63.714424
gputools 62.337396
tfassign 61.545926
opkernel 60.721556
sycl 57.064558
hlo 55.723587
libxsmm 54.820668
tfdisallow 53.666890
WMD evaluation is \( O(N^3) \) and becomes slow at N ≈ 100.
This allows us to avoid 95% of WMD evaluations on average.
class ClasName:@classmethoddef function_name(cls) -> str:varyable_name = "Hello, I'm ClassName object!"return varyable_name
funktion ➙ function
GetValu ➙ GetValue
str_lenght ➙ str_length
class ClassName:@classmethoddef function_name(cls) -> str:variable_name = "Hello, I'm ClassName object!"return variable_name
(identifier, token, candidate):
label=1 if the candidate is the correct suggestion
label=0 otherwise
From git history
| Dev\Files | File 1 | File 2 | File 3 |
|---|---|---|---|
| Dev 1 | 0 | 0.1 | 0.8 |
| Dev 2 | 0.4 | 0.1 | 0.5 |
From head revision
| Files\Tokens | Token 1 | Token 2 | Token 3 |
|---|---|---|---|
| File 1 | 0 | 0.1 | 0.8 |
| File 2 | 0.4 | 0.1 | 0.56 |
| File 3 | 0.43 | 0.18 | 0.9 |
Result after multiplication
| Dev\Token | Token 1 | Token 2 | Token 3 |
|---|---|---|---|
| Dev 1 | 0.384 | 0.154 | 0.776 |
| Dev 2 | 0.255 | 0.140 | 0.826 |